From e73624161cbf7dd2e417639b2f9f55c235b29b82 Mon Sep 17 00:00:00 2001 From: pbrod Date: Fri, 12 Feb 2016 13:11:20 +0100 Subject: [PATCH] Simplified wafo.stats: -Deleted obsolete files. -Requires scipy v0.16 -._distn_infrastructure.py monkeypatch scipy.stats._distn_infrastructure.py --- wafo/stats/__init__.py | 110 +- wafo/stats/_binned_statistic.py | 408 -- wafo/stats/_constants.py | 5 +- wafo/stats/_continuous_distns.py | 2054 +++++---- wafo/stats/_discrete_distns.py | 133 +- wafo/stats/_distn_infrastructure.py | 3815 ++--------------- wafo/stats/_distr_params.py | 7 +- wafo/stats/_multivariate.py | 884 ---- wafo/stats/_tukeylambda_stats.py | 201 - wafo/stats/contingency.py | 271 -- wafo/stats/distributions.py | 1 - wafo/stats/estimation.py | 322 +- wafo/stats/kde.py | 541 --- wafo/stats/kde_example.py | 15 - wafo/stats/misc.py | 13 - wafo/stats/morestats.py | 2377 ----------- wafo/stats/mstats.py | 79 - wafo/stats/mstats_basic.py | 2027 --------- wafo/stats/mstats_extras.py | 451 -- wafo/stats/rv.py | 76 - wafo/stats/six.py | 389 -- wafo/stats/stats.py | 4508 -------------------- wafo/stats/tests/common_tests.py | 147 +- wafo/stats/tests/test_binned_statistic.py | 238 -- wafo/stats/tests/test_contingency.py | 202 - wafo/stats/tests/test_continuous_basic.py | 197 +- wafo/stats/tests/test_discrete_basic.py | 91 +- wafo/stats/tests/test_distributions.py | 718 +++- wafo/stats/tests/test_fit.py | 17 +- wafo/stats/tests/test_kdeoth.py | 202 - wafo/stats/tests/test_morestats.py | 1009 ----- wafo/stats/tests/test_mstats_basic.py | 1055 ----- wafo/stats/tests/test_mstats_extras.py | 107 - wafo/stats/tests/test_multivariate.py | 485 --- wafo/stats/tests/test_rank.py | 193 - wafo/stats/tests/test_stats.py | 2830 ------------ wafo/stats/tests/test_tukeylambda_stats.py | 91 - wafo/stats/twolumps.py | 412 -- wafo/stats/vonmises.py | 47 - wafo/stats/vonmises_cython.pyx | 76 - 40 files changed, 2822 insertions(+), 23982 deletions(-) delete mode 100644 wafo/stats/_binned_statistic.py delete mode 100644 wafo/stats/_multivariate.py delete mode 100644 wafo/stats/_tukeylambda_stats.py delete mode 100644 wafo/stats/contingency.py delete mode 100644 wafo/stats/kde.py delete mode 100644 wafo/stats/kde_example.py delete mode 100644 wafo/stats/misc.py delete mode 100644 wafo/stats/morestats.py delete mode 100644 wafo/stats/mstats.py delete mode 100644 wafo/stats/mstats_basic.py delete mode 100644 wafo/stats/mstats_extras.py delete mode 100644 wafo/stats/rv.py delete mode 100644 wafo/stats/six.py delete mode 100644 wafo/stats/stats.py delete mode 100644 wafo/stats/tests/test_binned_statistic.py delete mode 100644 wafo/stats/tests/test_contingency.py delete mode 100644 wafo/stats/tests/test_kdeoth.py delete mode 100644 wafo/stats/tests/test_morestats.py delete mode 100644 wafo/stats/tests/test_mstats_basic.py delete mode 100644 wafo/stats/tests/test_mstats_extras.py delete mode 100644 wafo/stats/tests/test_multivariate.py delete mode 100644 wafo/stats/tests/test_rank.py delete mode 100644 wafo/stats/tests/test_stats.py delete mode 100644 wafo/stats/tests/test_tukeylambda_stats.py delete mode 100644 wafo/stats/twolumps.py delete mode 100644 wafo/stats/vonmises.py delete mode 100644 wafo/stats/vonmises_cython.pyx diff --git a/wafo/stats/__init__.py b/wafo/stats/__init__.py index 35ca55f..e3c5dbb 100644 --- a/wafo/stats/__init__.py +++ b/wafo/stats/__init__.py @@ -8,50 +8,14 @@ Statistical functions (:mod:`scipy.stats`) This module contains a large number of probability distributions as well as a growing library of 
statistical functions. -Each included distribution is an instance of the class rv_continuous: -For each given name the following methods are available: +Each univariate distribution is an instance of a subclass of `rv_continuous` +(`rv_discrete` for discrete distributions): .. autosummary:: :toctree: generated/ rv_continuous - rv_continuous.pdf - rv_continuous.logpdf - rv_continuous.cdf - rv_continuous.logcdf - rv_continuous.sf - rv_continuous.logsf - rv_continuous.ppf - rv_continuous.isf - rv_continuous.moment - rv_continuous.stats - rv_continuous.entropy - rv_continuous.fit - rv_continuous.expect - -Calling the instance as a function returns a frozen pdf whose shape, -location, and scale parameters are fixed. - -Similarly, each discrete distribution is an instance of the class -rv_discrete: - -.. autosummary:: - :toctree: generated/ - rv_discrete - rv_discrete.rvs - rv_discrete.pmf - rv_discrete.logpmf - rv_discrete.cdf - rv_discrete.logcdf - rv_discrete.sf - rv_discrete.logsf - rv_discrete.ppf - rv_discrete.isf - rv_discrete.stats - rv_discrete.moment - rv_discrete.entropy - rv_discrete.expect Continuous distributions ======================== @@ -65,7 +29,8 @@ Continuous distributions beta -- Beta betaprime -- Beta Prime bradford -- Bradford - burr -- Burr + burr -- Burr (Type III) + burr12 -- Burr (Type XII) cauchy -- Cauchy chi -- Chi chi2 -- Chi-squared @@ -74,6 +39,7 @@ Continuous distributions dweibull -- Double Weibull erlang -- Erlang expon -- Exponential + exponnorm -- Exponentially Modified Normal exponweib -- Exponentiated Weibull exponpow -- Exponential Power f -- F (Snecdor F) @@ -84,6 +50,7 @@ Continuous distributions frechet_r -- Frechet Right Sided, Extreme Value Type II (Extreme LB) or weibull_min frechet_l -- Frechet Left Sided, Weibull_max genlogistic -- Generalized Logistic + gennorm -- Generalized normal genpareto -- Generalized Pareto genexpon -- Generalized Exponential genextreme -- Generalized Extreme Value @@ -98,6 +65,7 @@ Continuous distributions halfcauchy -- Half Cauchy halflogistic -- Half Logistic halfnorm -- Half Normal + halfgennorm -- Generalized Half Normal hypsecant -- Hyperbolic Secant invgamma -- Inverse Gamma invgauss -- Inverse Gaussian @@ -107,6 +75,9 @@ Continuous distributions ksone -- Kolmogorov-Smirnov one-sided (no stats) kstwobign -- Kolmogorov-Smirnov two-sided test for Large N (no stats) laplace -- Laplace + levy -- Levy + levy_l + levy_stable logistic -- Logistic loggamma -- Log-Gamma loglaplace -- Log-Laplace (Log Double Exponential) @@ -130,6 +101,7 @@ Continuous distributions rice -- Rice recipinvgauss -- Reciprocal Inverse Gaussian semicircular -- Semicircular + skewnorm -- Skew normal t -- Student's T triang -- Triangular truncexpon -- Truncated Exponential @@ -137,6 +109,7 @@ Continuous distributions tukeylambda -- Tukey-Lambda uniform -- Uniform vonmises -- Von-Mises (Circular) + vonmises_line -- Von-Mises (Line) wald -- Wald weibull_min -- Minimum Weibull (see Frechet) weibull_max -- Maximum Weibull (see Frechet) @@ -149,7 +122,12 @@ Multivariate distributions :toctree: generated/ multivariate_normal -- Multivariate normal distribution + matrix_normal -- Matrix normal distribution dirichlet -- Dirichlet + wishart -- Wishart + invwishart -- Inverse Wishart + special_ortho_group -- SO(N) group + ortho_group -- O(N) group Discrete distributions ====================== @@ -190,27 +168,28 @@ which work for masked arrays. 
normaltest -- skew -- Skewness skewtest -- + kstat -- + kstatvar -- tmean -- Truncated arithmetic mean tvar -- Truncated variance tmin -- tmax -- tstd -- tsem -- - nanmean -- Mean, ignoring NaN values - nanstd -- Standard deviation, ignoring NaN values - nanmedian -- Median, ignoring NaN values variation -- Coefficient of variation + find_repeats + trim_mean .. autosummary:: :toctree: generated/ - cumfreq _ - histogram2 _ - histogram _ - itemfreq _ - percentileofscore _ - scoreatpercentile _ - relfreq _ + cumfreq + histogram2 + histogram + itemfreq + percentileofscore + scoreatpercentile + relfreq .. autosummary:: :toctree: generated/ @@ -225,6 +204,7 @@ which work for masked arrays. obrientransform signaltonoise bayes_mvs + mvsdist sem zmap zscore @@ -247,12 +227,14 @@ which work for masked arrays. kendalltau linregress theilslopes + f_value .. autosummary:: :toctree: generated/ ttest_1samp ttest_ind + ttest_ind_from_stats ttest_rel kstest chisquare @@ -265,6 +247,10 @@ which work for masked arrays. wilcoxon kruskal friedmanchisquare + combine_pvalues + ss + square_of_sums + jarque_bera .. autosummary:: :toctree: generated/ @@ -289,6 +275,22 @@ which work for masked arrays. entropy +.. autosummary:: + :toctree: generated/ + + chisqprob + betai + +Circular statistical functions +============================== + +.. autosummary:: + :toctree: generated/ + + circmean + circvar + circstd + Contingency table functions =========================== @@ -335,21 +337,11 @@ interface package rpy. from __future__ import division, print_function, absolute_import from scipy.stats import * from .core import * -from .stats import * from .distributions import * -from .rv import * -from .morestats import * -from ._binned_statistic import * -from .kde import gaussian_kde -from . import mstats -from .contingency import chi2_contingency -from ._multivariate import * from . import estimation #remove vonmises_cython from __all__, I don't know why it is included __all__ = [s for s in dir() if not (s.startswith('_') or s.endswith('cython'))] -#import distributions #@Reimport -#from wafo.stats.distributions import * from numpy.testing import Tester test = Tester().test diff --git a/wafo/stats/_binned_statistic.py b/wafo/stats/_binned_statistic.py deleted file mode 100644 index ff5d353..0000000 --- a/wafo/stats/_binned_statistic.py +++ /dev/null @@ -1,408 +0,0 @@ -from __future__ import division, print_function, absolute_import - -import warnings - -import numpy as np -from scipy._lib.six import callable - - -def binned_statistic(x, values, statistic='mean', - bins=10, range=None): - """ - Compute a binned statistic for a set of data. - - This is a generalization of a histogram function. A histogram divides - the space into bins, and returns the count of the number of points in - each bin. This function allows the computation of the sum, mean, median, - or other statistic of the values within each bin. - - Parameters - ---------- - x : array_like - A sequence of values to be binned. - values : array_like - The values on which the statistic will be computed. This must be - the same shape as `x`. - statistic : string or callable, optional - The statistic to compute (default is 'mean'). - The following statistics are available: - - * 'mean' : compute the mean of values for points within each bin. - Empty bins will be represented by NaN. - * 'median' : compute the median of values for points within each - bin. Empty bins will be represented by NaN. - * 'count' : compute the count of points within each bin. 
This is - identical to an unweighted histogram. `values` array is not - referenced. - * 'sum' : compute the sum of values for points within each bin. - This is identical to a weighted histogram. - * function : a user-defined function which takes a 1D array of - values, and outputs a single numerical statistic. This function - will be called on the values in each bin. Empty bins will be - represented by function([]), or NaN if this returns an error. - - bins : int or sequence of scalars, optional - If `bins` is an int, it defines the number of equal-width - bins in the given range (10, by default). If `bins` is a sequence, - it defines the bin edges, including the rightmost edge, allowing - for non-uniform bin widths. - range : (float, float) or [(float, float)], optional - The lower and upper range of the bins. If not provided, range - is simply ``(x.min(), x.max())``. Values outside the range are - ignored. - - Returns - ------- - statistic : array - The values of the selected statistic in each bin. - bin_edges : array of dtype float - Return the bin edges ``(length(statistic)+1)``. - binnumber : 1-D ndarray of ints - This assigns to each observation an integer that represents the bin - in which this observation falls. Array has the same length as values. - - See Also - -------- - numpy.histogram, binned_statistic_2d, binned_statistic_dd - - Notes - ----- - All but the last (righthand-most) bin is half-open. In other words, if - `bins` is:: - - [1, 2, 3, 4] - - then the first bin is ``[1, 2)`` (including 1, but excluding 2) and the - second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which *includes* - 4. - - .. versionadded:: 0.11.0 - - Examples - -------- - >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean', - ... bins=3) - (array([ 1., 2., 4.]), array([ 1., 2., 3., 4.]), array([1, 2, 1, 2, 3])) - - >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean', bins=3) - (array([ 1., 2., 4.]), array([ 1., 2., 3., 4.]), array([1, 2, 1, 2, 3])) - - """ - try: - N = len(bins) - except TypeError: - N = 1 - - if N != 1: - bins = [np.asarray(bins, float)] - - if range is not None: - if len(range) == 2: - range = [range] - - medians, edges, xy = binned_statistic_dd([x], values, statistic, - bins, range) - - return medians, edges[0], xy - - -def binned_statistic_2d(x, y, values, statistic='mean', - bins=10, range=None): - """ - Compute a bidimensional binned statistic for a set of data. - - This is a generalization of a histogram2d function. A histogram divides - the space into bins, and returns the count of the number of points in - each bin. This function allows the computation of the sum, mean, median, - or other statistic of the values within each bin. - - Parameters - ---------- - x : (N,) array_like - A sequence of values to be binned along the first dimension. - y : (M,) array_like - A sequence of values to be binned along the second dimension. - values : (N,) array_like - The values on which the statistic will be computed. This must be - the same shape as `x`. - statistic : string or callable, optional - The statistic to compute (default is 'mean'). - The following statistics are available: - - * 'mean' : compute the mean of values for points within each bin. - Empty bins will be represented by NaN. - * 'median' : compute the median of values for points within each - bin. Empty bins will be represented by NaN. - * 'count' : compute the count of points within each bin. This is - identical to an unweighted histogram. 
`values` array is not - referenced. - * 'sum' : compute the sum of values for points within each bin. - This is identical to a weighted histogram. - * function : a user-defined function which takes a 1D array of - values, and outputs a single numerical statistic. This function - will be called on the values in each bin. Empty bins will be - represented by function([]), or NaN if this returns an error. - - bins : int or [int, int] or array-like or [array, array], optional - The bin specification: - - * the number of bins for the two dimensions (nx=ny=bins), - * the number of bins in each dimension (nx, ny = bins), - * the bin edges for the two dimensions (x_edges = y_edges = bins), - * the bin edges in each dimension (x_edges, y_edges = bins). - - range : (2,2) array_like, optional - The leftmost and rightmost edges of the bins along each dimension - (if not specified explicitly in the `bins` parameters): - [[xmin, xmax], [ymin, ymax]]. All values outside of this range will be - considered outliers and not tallied in the histogram. - - Returns - ------- - statistic : (nx, ny) ndarray - The values of the selected statistic in each two-dimensional bin - xedges : (nx + 1) ndarray - The bin edges along the first dimension. - yedges : (ny + 1) ndarray - The bin edges along the second dimension. - binnumber : 1-D ndarray of ints - This assigns to each observation an integer that represents the bin - in which this observation falls. Array has the same length as `values`. - - See Also - -------- - numpy.histogram2d, binned_statistic, binned_statistic_dd - - Notes - ----- - - .. versionadded:: 0.11.0 - - """ - - # This code is based on np.histogram2d - try: - N = len(bins) - except TypeError: - N = 1 - - if N != 1 and N != 2: - xedges = yedges = np.asarray(bins, float) - bins = [xedges, yedges] - - medians, edges, xy = binned_statistic_dd([x, y], values, statistic, - bins, range) - - return medians, edges[0], edges[1], xy - - -def binned_statistic_dd(sample, values, statistic='mean', - bins=10, range=None): - """ - Compute a multidimensional binned statistic for a set of data. - - This is a generalization of a histogramdd function. A histogram divides - the space into bins, and returns the count of the number of points in - each bin. This function allows the computation of the sum, mean, median, - or other statistic of the values within each bin. - - Parameters - ---------- - sample : array_like - Data to histogram passed as a sequence of D arrays of length N, or - as an (N,D) array. - values : array_like - The values on which the statistic will be computed. This must be - the same shape as x. - statistic : string or callable, optional - The statistic to compute (default is 'mean'). - The following statistics are available: - - * 'mean' : compute the mean of values for points within each bin. - Empty bins will be represented by NaN. - * 'median' : compute the median of values for points within each - bin. Empty bins will be represented by NaN. - * 'count' : compute the count of points within each bin. This is - identical to an unweighted histogram. `values` array is not - referenced. - * 'sum' : compute the sum of values for points within each bin. - This is identical to a weighted histogram. - * function : a user-defined function which takes a 1D array of - values, and outputs a single numerical statistic. This function - will be called on the values in each bin. Empty bins will be - represented by function([]), or NaN if this returns an error. 
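The deleted ``_binned_statistic.py`` above duplicated functionality that ships with scipy itself (the commit now requires scipy v0.16). A minimal sketch of the drop-in replacement, assuming the standard ``scipy.stats.binned_statistic`` signature::

    import numpy as np
    from scipy.stats import binned_statistic

    x = np.array([1, 2, 1, 2, 4])
    values = np.arange(5)

    # Mean of `values` in each of 3 equal-width bins over (1, 4);
    # empty bins would be reported as NaN, as described above.
    stat, edges, binnumber = binned_statistic(x, values,
                                              statistic='mean', bins=3)
    # stat -> [1., 2., 4.], edges -> [1., 2., 3., 4.]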
- - bins : sequence or int, optional - The bin specification: - - * A sequence of arrays describing the bin edges along each dimension. - * The number of bins for each dimension (nx, ny, ... =bins) - * The number of bins for all dimensions (nx=ny=...=bins). - - range : sequence, optional - A sequence of lower and upper bin edges to be used if the edges are - not given explicitely in `bins`. Defaults to the minimum and maximum - values along each dimension. - - Returns - ------- - statistic : ndarray, shape(nx1, nx2, nx3,...) - The values of the selected statistic in each two-dimensional bin - edges : list of ndarrays - A list of D arrays describing the (nxi + 1) bin edges for each - dimension - binnumber : 1-D ndarray of ints - This assigns to each observation an integer that represents the bin - in which this observation falls. Array has the same length as values. - - See Also - -------- - np.histogramdd, binned_statistic, binned_statistic_2d - - Notes - ----- - - .. versionadded:: 0.11.0 - - """ - if type(statistic) == str: - if statistic not in ['mean', 'median', 'count', 'sum', 'std']: - raise ValueError('unrecognized statistic "%s"' % statistic) - elif callable(statistic): - pass - else: - raise ValueError("statistic not understood") - - # This code is based on np.histogramdd - try: - # Sample is an ND-array. - N, D = sample.shape - except (AttributeError, ValueError): - # Sample is a sequence of 1D arrays. - sample = np.atleast_2d(sample).T - N, D = sample.shape - - nbin = np.empty(D, int) - edges = D * [None] - dedges = D * [None] - - try: - M = len(bins) - if M != D: - raise AttributeError('The dimension of bins must be equal ' - 'to the dimension of the sample x.') - except TypeError: - bins = D * [bins] - - # Select range for each dimension - # Used only if number of bins is given. - if range is None: - smin = np.atleast_1d(np.array(sample.min(0), float)) - smax = np.atleast_1d(np.array(sample.max(0), float)) - else: - smin = np.zeros(D) - smax = np.zeros(D) - for i in np.arange(D): - smin[i], smax[i] = range[i] - - # Make sure the bins have a finite width. - for i in np.arange(len(smin)): - if smin[i] == smax[i]: - smin[i] = smin[i] - .5 - smax[i] = smax[i] + .5 - - # Create edge arrays - for i in np.arange(D): - if np.isscalar(bins[i]): - nbin[i] = bins[i] + 2 # +2 for outlier bins - edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1) - else: - edges[i] = np.asarray(bins[i], float) - nbin[i] = len(edges[i]) + 1 # +1 for outlier bins - dedges[i] = np.diff(edges[i]) - - nbin = np.asarray(nbin) - - # Compute the bin number each sample falls into. - Ncount = {} - for i in np.arange(D): - Ncount[i] = np.digitize(sample[:, i], edges[i]) - - # Using digitize, values that fall on an edge are put in the right bin. - # For the rightmost bin, we want values equal to the right - # edge to be counted in the last bin, and not as an outlier. - for i in np.arange(D): - # Rounding precision - decimal = int(-np.log10(dedges[i].min())) + 6 - # Find which points are on the rightmost edge. - on_edge = np.where(np.around(sample[:, i], decimal) - == np.around(edges[i][-1], decimal))[0] - # Shift these points one bin to the left. - Ncount[i][on_edge] -= 1 - - # Compute the sample indices in the flattened statistic matrix. 
- ni = nbin.argsort() - xy = np.zeros(N, int) - for i in np.arange(0, D - 1): - xy += Ncount[ni[i]] * nbin[ni[i + 1:]].prod() - xy += Ncount[ni[-1]] - - result = np.empty(nbin.prod(), float) - - if statistic == 'mean': - result.fill(np.nan) - flatcount = np.bincount(xy, None) - flatsum = np.bincount(xy, values) - a = flatcount.nonzero() - result[a] = flatsum[a] / flatcount[a] - elif statistic == 'std': - result.fill(0) - flatcount = np.bincount(xy, None) - flatsum = np.bincount(xy, values) - flatsum2 = np.bincount(xy, values ** 2) - a = flatcount.nonzero() - result[a] = np.sqrt(flatsum2[a] / flatcount[a] - - (flatsum[a] / flatcount[a]) ** 2) - elif statistic == 'count': - result.fill(0) - flatcount = np.bincount(xy, None) - a = np.arange(len(flatcount)) - result[a] = flatcount - elif statistic == 'sum': - result.fill(0) - flatsum = np.bincount(xy, values) - a = np.arange(len(flatsum)) - result[a] = flatsum - elif statistic == 'median': - result.fill(np.nan) - for i in np.unique(xy): - result[i] = np.median(values[xy == i]) - elif callable(statistic): - with warnings.catch_warnings(): - # Numpy generates a warnings for mean/std/... with empty list - warnings.filterwarnings('ignore', category=RuntimeWarning) - old = np.seterr(invalid='ignore') - try: - null = statistic([]) - except: - null = np.nan - np.seterr(**old) - result.fill(null) - for i in np.unique(xy): - result[i] = statistic(values[xy == i]) - - # Shape into a proper matrix - result = result.reshape(np.sort(nbin)) - for i in np.arange(nbin.size): - j = ni.argsort()[i] - result = result.swapaxes(i, j) - ni[i], ni[j] = ni[j], ni[i] - - # Remove outliers (indices 0 and -1 for each dimension). - core = D * [slice(1, -1)] - result = result[core] - - if (result.shape != nbin - 2).any(): - raise RuntimeError('Internal Shape Error') - - return result, edges, xy diff --git a/wafo/stats/_constants.py b/wafo/stats/_constants.py index 4b5048f..552e697 100644 --- a/wafo/stats/_constants.py +++ b/wafo/stats/_constants.py @@ -13,6 +13,10 @@ _EPS = np.finfo(float).eps # The largest [in magnitude] usable floating value. _XMAX = np.finfo(float).machar.xmax +# The log of the largest usable floating value; useful for knowing +# when exp(something) will overflow +_LOGXMAX = np.log(_XMAX) + # The smallest [in magnitude] usable floating value. 
_XMIN = np.finfo(float).machar.xmin @@ -21,4 +25,3 @@ _EULER = 0.577215664901532860606512090082402431042 # special.zeta(3, 1) Apery's constant _ZETA3 = 1.202056903159594285399738161511449990765 - diff --git a/wafo/stats/_continuous_distns.py b/wafo/stats/_continuous_distns.py index f6c9fd2..504ce8e 100644 --- a/wafo/stats/_continuous_distns.py +++ b/wafo/stats/_continuous_distns.py @@ -12,68 +12,41 @@ from scipy import special from scipy import optimize from scipy import integrate from scipy.special import (gammaln as gamln, gamma as gam, boxcox, boxcox1p, - log1p, expm1) # inv_boxcox, inv_boxcox1p) + inv_boxcox, inv_boxcox1p, erfc, chndtr, chndtrix, + log1p, expm1) -from numpy import (where, arange, putmask, ravel, sum, shape, +from numpy import (where, arange, putmask, ravel, shape, log, sqrt, exp, arctanh, tan, sin, arcsin, arctan, tanh, cos, cosh, sinh) -from numpy import polyval, place, extract, any, asarray, nan, inf, pi +from numpy import polyval, place, extract, asarray, nan, inf, pi import numpy as np -import numpy.random as mtrand +from scipy.stats.mstats_basic import mode try: from scipy.stats import vonmises_cython except: vonmises_cython = None -# try: -# from scipy.stats._tukeylambda_stats import \ -# tukeylambda_variance as _tlvar, \ -# tukeylambda_kurtosis as _tlkurt -# except: -# _tlvar = _tlkurt = None -# from . import vonmises_cython -from ._tukeylambda_stats import (tukeylambda_variance as _tlvar, - tukeylambda_kurtosis as _tlkurt) + +from scipy.stats._tukeylambda_stats import (tukeylambda_variance as _tlvar, + tukeylambda_kurtosis as _tlkurt) from ._distn_infrastructure import ( - rv_continuous, valarray, _skew, _kurtosis, _lazywhere, - _ncx2_log_pdf, _ncx2_pdf, _ncx2_cdf, get_distribution_names) - -from ._constants import _XMIN, _EULER, _ZETA3, _EPS -from .stats import mode -# from .estimation import FitDistribution - -__all__ = [ - 'ksone', 'kstwobign', 'norm', 'alpha', 'anglit', 'arcsine', - 'beta', 'betaprime', 'bradford', 'burr', 'fisk', 'cauchy', - 'chi', 'chi2', 'cosine', 'dgamma', 'dweibull', 'erlang', - 'expon', 'exponweib', 'exponpow', 'fatiguelife', 'foldcauchy', - 'f', 'foldnorm', 'frechet_r', 'weibull_min', 'frechet_l', - 'weibull_max', 'genlogistic', 'genpareto', 'genexpon', 'genextreme', - 'gamma', 'gengamma', 'genhalflogistic', 'gompertz', 'gumbel_r', - 'gumbel_l', 'halfcauchy', 'halflogistic', 'halfnorm', 'hypsecant', - 'gausshyper', 'invgamma', 'invgauss', 'invweibull', - 'johnsonsb', 'johnsonsu', 'laplace', 'levy', 'levy_l', - 'levy_stable', 'logistic', 'loggamma', 'loglaplace', 'lognorm', - 'gilbrat', 'maxwell', 'mielke', 'nakagami', 'ncx2', 'ncf', 't', - 'nct', 'pareto', 'lomax', 'pearson3', 'powerlaw', 'powerlognorm', - 'powernorm', 'rdist', 'rayleigh', 'reciprocal', 'rice', - 'truncrayleigh', - 'recipinvgauss', 'semicircular', 'triang', 'truncexpon', - 'truncnorm', 'tukeylambda', 'uniform', 'vonmises', 'vonmises_line', - 'wald', 'wrapcauchy'] - - -# Kolmogorov-Smirnov one-sided and two-sided test statistics -class ksone_gen(rv_continuous): + rv_continuous, valarray, _skew, _kurtosis, # @UnresolvedImport + _lazywhere, _ncx2_log_pdf, _ncx2_pdf, _ncx2_cdf, # @UnresolvedImport + get_distribution_names, # @UnresolvedImport + ) + +from ._constants import _XMIN, _EULER, _ZETA3, _XMAX, _LOGXMAX, _EPS + +## Kolmogorov-Smirnov one-sided and two-sided test statistics +class ksone_gen(rv_continuous): """General Kolmogorov-Smirnov one-sided test. 
%(default)s """ - def _cdf(self, x, n): return 1.0 - special.smirnov(n, x) @@ -83,13 +56,11 @@ ksone = ksone_gen(a=0.0, name='ksone') class kstwobign_gen(rv_continuous): - """Kolmogorov-Smirnov two-sided test for large N. %(default)s """ - def _cdf(self, x): return 1.0 - special.kolmogorov(x) @@ -97,25 +68,25 @@ class kstwobign_gen(rv_continuous): return special.kolmogorov(x) def _ppf(self, q): - return special.kolmogi(1.0 - q) + return special.kolmogi(1.0-q) kstwobign = kstwobign_gen(a=0.0, name='kstwobign') -# Normal distribution +## Normal distribution # loc = mu, scale = std # Keep these implementations out of the class definition so they can be reused # by other distributions. -_norm_pdf_C = np.sqrt(2 * pi) +_norm_pdf_C = np.sqrt(2*pi) _norm_pdf_logC = np.log(_norm_pdf_C) def _norm_pdf(x): - return exp(-x ** 2 / 2.0) / _norm_pdf_C + return exp(-x**2/2.0) / _norm_pdf_C def _norm_logpdf(x): - return -x ** 2 / 2.0 - _norm_pdf_logC + return -x**2 / 2.0 - _norm_pdf_logC def _norm_cdf(x): @@ -143,7 +114,6 @@ def _norm_isf(q): class norm_gen(rv_continuous): - """A normal continuous random variable. The location (loc) keyword specifies the mean. @@ -157,12 +127,13 @@ class norm_gen(rv_continuous): norm.pdf(x) = exp(-x**2/2)/sqrt(2*pi) + %(after_notes)s + %(example)s """ - def _rvs(self): - return mtrand.standard_normal(self._size) + return self._random_state.standard_normal(self._size) def _pdf(self, x): return _norm_pdf(x) @@ -192,7 +163,7 @@ class norm_gen(rv_continuous): return 0.0, 1.0, 0.0, 0.0 def _entropy(self): - return 0.5 * (log(2 * pi) + 1) + return 0.5*(log(2*pi)+1) @inherit_docstring_from(rv_continuous) def fit(self, data, **kwds): @@ -219,7 +190,7 @@ class norm_gen(rv_continuous): loc = floc if fscale is None: - scale = np.sqrt(((data - loc) ** 2).mean()) + scale = np.sqrt(((data - loc)**2).mean()) else: scale = fscale @@ -229,7 +200,6 @@ norm = norm_gen(name='norm') class alpha_gen(rv_continuous): - """An alpha continuous random variable. %(before_notes)s @@ -242,29 +212,31 @@ class alpha_gen(rv_continuous): where ``Phi(alpha)`` is the normal CDF, ``x > 0``, and ``a > 0``. + `alpha` takes ``a`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _pdf(self, x, a): - return 1.0 / (x ** 2) / special.ndtr(a) * _norm_pdf(a - 1.0 / x) + return 1.0/(x**2)/special.ndtr(a)*_norm_pdf(a-1.0/x) def _logpdf(self, x, a): - return -2 * log(x) + _norm_logpdf(a - 1.0 / x) - log(special.ndtr(a)) + return -2*log(x) + _norm_logpdf(a-1.0/x) - log(special.ndtr(a)) def _cdf(self, x, a): - return special.ndtr(a - 1.0 / x) / special.ndtr(a) + return special.ndtr(a-1.0/x) / special.ndtr(a) def _ppf(self, q, a): - return 1.0 / asarray(a - special.ndtri(q * special.ndtr(a))) + return 1.0/asarray(a-special.ndtri(q*special.ndtr(a))) def _stats(self, a): - return [inf] * 2 + [nan] * 2 + return [inf]*2 + [nan]*2 alpha = alpha_gen(a=0.0, name='alpha') class anglit_gen(rv_continuous): - """An anglit continuous random variable. %(before_notes)s @@ -277,30 +249,29 @@ class anglit_gen(rv_continuous): for ``-pi/4 <= x <= pi/4``. 
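The closed forms quoted for ``anglit`` above are simple enough to verify numerically; a small sketch against the scipy distribution that the rewritten ``__init__.py`` re-exports::

    import numpy as np
    from scipy.stats import anglit

    x = np.linspace(-np.pi/4, np.pi/4, 101)
    # pdf(x) = cos(2x) and cdf(x) = sin(x + pi/4)**2 on [-pi/4, pi/4]
    assert np.allclose(anglit.pdf(x), np.cos(2*x))
    assert np.allclose(anglit.cdf(x), np.sin(x + np.pi/4)**2)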
+ %(after_notes)s + %(example)s """ - def _pdf(self, x): - return cos(2 * x) + return cos(2*x) def _cdf(self, x): - return sin(x + pi / 4) ** 2.0 + return sin(x+pi/4)**2.0 def _ppf(self, q): - return (arcsin(sqrt(q)) - pi / 4) + return (arcsin(sqrt(q))-pi/4) def _stats(self): - return 0.0, pi * pi / 16 - 0.5, 0.0, -2 * \ - (pi ** 4 - 96) / (pi * pi - 8) ** 2 + return 0.0, pi*pi/16-0.5, 0.0, -2*(pi**4 - 96)/(pi*pi-8)**2 def _entropy(self): - return 1 - log(2) -anglit = anglit_gen(a=-pi / 4, b=pi / 4, name='anglit') + return 1-log(2) +anglit = anglit_gen(a=-pi/4, b=pi/4, name='anglit') class arcsine_gen(rv_continuous): - """An arcsine continuous random variable. %(before_notes)s @@ -313,24 +284,25 @@ class arcsine_gen(rv_continuous): for ``0 < x < 1``. + %(after_notes)s + %(example)s """ - def _pdf(self, x): - return 1.0 / pi / sqrt(x * (1 - x)) + return 1.0/pi/sqrt(x*(1-x)) def _cdf(self, x): - return 2.0 / pi * arcsin(sqrt(x)) + return 2.0/pi*arcsin(sqrt(x)) def _ppf(self, q): - return sin(pi / 2.0 * q) ** 2.0 + return sin(pi/2.0*q)**2.0 def _stats(self): mu = 0.5 - mu2 = 1.0 / 8 + mu2 = 1.0/8 g1 = 0 - g2 = -3.0 / 2.0 + g2 = -3.0/2.0 return mu, mu2, g1, g2 def _entropy(self): @@ -342,7 +314,6 @@ class FitDataError(ValueError): # This exception is raised by, for example, beta_gen.fit when both floc # and fscale are fixed and there are values in the data not in the open # interval (floc, floc+fscale). - def __init__(self, distr, lower, upper): self.args = ( "Invalid values in `data`. Maximum likelihood " @@ -355,7 +326,6 @@ class FitDataError(ValueError): class FitSolverError(RuntimeError): # This exception is raised by, for example, beta_gen.fit when # optimize.fsolve returns with ier != 1. - def __init__(self, mesg): emsg = "Solver for the MLE equations failed to converge: " emsg += mesg.replace('\n', '') @@ -386,7 +356,6 @@ def _beta_mle_ab(theta, n, s1, s2): class beta_gen(rv_continuous): - """A beta continuous random variable. %(before_notes)s @@ -402,18 +371,21 @@ class beta_gen(rv_continuous): for ``0 < x < 1``, ``a > 0``, ``b > 0``, where ``gamma(z)`` is the gamma function (`scipy.special.gamma`). + `beta` takes ``a`` and ``b`` as shape parameters. 
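When both ``floc`` and ``fscale`` are fixed, the ``beta_gen.fit`` override above skips the generic four-parameter optimizer and solves the MLE equations (``_beta_mle_a``, ``_beta_mle_ab``) with ``fsolve``. A usage sketch, assuming scipy's standard ``fit`` keywords and the ``random_state`` argument of v0.16::

    import numpy as np
    from scipy.stats import beta

    rng = np.random.RandomState(1234)
    data = beta.rvs(2.0, 5.0, size=1000, random_state=rng)

    # loc and scale fixed: only a and b are estimated, via fsolve.
    a, b, loc, scale = beta.fit(data, floc=0, fscale=1)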
+ + %(after_notes)s + %(example)s """ - def _rvs(self, a, b): - return mtrand.beta(a, b, self._size) + return self._random_state.beta(a, b, self._size) def _pdf(self, x, a, b): return np.exp(self._logpdf(x, a, b)) def _logpdf(self, x, a, b): - lPx = special.xlog1py(b - 1.0, -x) + special.xlogy(a - 1.0, x) + lPx = special.xlog1py(b-1.0, -x) + special.xlogy(a-1.0, x) lPx -= special.betaln(a, b) return lPx @@ -424,12 +396,11 @@ class beta_gen(rv_continuous): return special.btdtri(a, b, q) def _stats(self, a, b): - mn = a * 1.0 / (a + b) - var = (a * b * 1.0) / (a + b + 1.0) / (a + b) ** 2.0 - g1 = 2.0 * (b - a) * sqrt((1.0 + a + b) / (a * b)) / (2 + a + b) - g2 = 6.0 * (a ** 3 + a ** 2 * (1 - 2 * b) + b ** - 2 * (1 + b) - 2 * a * b * (2 + b)) - g2 /= a * b * (a + b + 2) * (a + b + 3) + mn = a*1.0 / (a + b) + var = (a*b*1.0)/(a+b+1.0)/(a+b)**2.0 + g1 = 2.0*(b-a)*sqrt((1.0+a+b)/(a*b)) / (2+a+b) + g2 = 6.0*(a**3 + a**2*(1-2*b) + b**2*(1+b) - 2*a*b*(2+b)) + g2 /= a*b*(a+b+2)*(a+b+3) return mn, var, g1, g2 def _fitstart(self, data): @@ -438,12 +409,11 @@ class beta_gen(rv_continuous): def func(x): a, b = x - sk = 2 * (b - a) * sqrt(a + b + 1) / (a + b + 2) / sqrt(a * b) - ku = a ** 3 - a ** 2 * \ - (2 * b - 1) + b ** 2 * (b + 1) - 2 * a * b * (b + 2) - ku /= a * b * (a + b + 2) * (a + b + 3) + sk = 2*(b-a)*sqrt(a + b + 1) / (a + b + 2) / sqrt(a*b) + ku = a**3 - a**2*(2*b-1) + b**2*(b+1) - 2*a*b*(b+2) + ku /= a*b*(a+b+2)*(a+b+3) ku *= 6 - return [sk - g1, ku - g2] + return [sk-g1, ku-g2] a, b = optimize.fsolve(func, (1.0, 1.0)) return super(beta_gen, self)._fitstart(data, args=(a, b)) @@ -457,8 +427,10 @@ class beta_gen(rv_continuous): # Override rv_continuous.fit, so we can more efficiently handle the # case where floc and fscale are given. - f0 = kwds.get('f0', None) - f1 = kwds.get('f1', None) + f0 = (kwds.get('f0', None) or kwds.get('fa', None) or + kwds.get('fix_a', None)) + f1 = (kwds.get('f1', None) or kwds.get('fb', None) or + kwds.get('fix_b', None)) floc = kwds.get('floc', None) fscale = kwds.get('fscale', None) @@ -503,9 +475,11 @@ class beta_gen(rv_continuous): a = b * xbar / (1 - xbar) # Compute the MLE for `a` by solving _beta_mle_a. - theta, _info, ier, mesg = optimize.fsolve( - _beta_mle_a, a, args=(b, len(data), np.log(data).sum()), - full_output=True) + theta, info, ier, mesg = optimize.fsolve( + _beta_mle_a, a, + args=(b, len(data), np.log(data).sum()), + full_output=True + ) if ier != 1: raise FitSolverError(mesg=mesg) a = theta[0] @@ -530,9 +504,11 @@ class beta_gen(rv_continuous): b = (1 - xbar) * fac # Compute the MLE for a and b by solving _beta_mle_ab. - theta, _info, ier, mesg = optimize.fsolve( - _beta_mle_ab, [a, b], args=(len(data), s1, s2), - full_output=True) + theta, info, ier, mesg = optimize.fsolve( + _beta_mle_ab, [a, b], + args=(len(data), s1, s2), + full_output=True + ) if ier != 1: raise FitSolverError(mesg=mesg) a, b = theta @@ -543,7 +519,6 @@ beta = beta_gen(a=0.0, b=1.0, name='beta') class betaprime_gen(rv_continuous): - """A beta prime continuous random variable. %(before_notes)s @@ -557,27 +532,28 @@ class betaprime_gen(rv_continuous): for ``x > 0``, ``a > 0``, ``b > 0``, where ``beta(a, b)`` is the beta function (see `scipy.special.beta`). + `betaprime` takes ``a`` and ``b`` as shape parameters. 
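``betaprime._rvs`` above relies on the classical construction that the ratio of independent gamma variates with shapes ``a`` and ``b`` follows a beta prime distribution. A quick numerical check of that construction::

    import numpy as np
    from scipy.stats import gamma, betaprime

    rng = np.random.RandomState(0)
    a, b = 3.0, 4.0
    u1 = gamma.rvs(a, size=100000, random_state=rng)
    u2 = gamma.rvs(b, size=100000, random_state=rng)

    # The sample median of u1/u2 should approach the betaprime median.
    print(np.median(u1 / u2), betaprime.median(a, b))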
+ + %(after_notes)s + %(example)s """ - def _rvs(self, a, b): - u1 = gamma.rvs(a, size=self._size) - u2 = gamma.rvs(b, size=self._size) + sz, rndm = self._size, self._random_state + u1 = gamma.rvs(a, size=sz, random_state=rndm) + u2 = gamma.rvs(b, size=sz, random_state=rndm) return (u1 / u2) def _pdf(self, x, a, b): return np.exp(self._logpdf(x, a, b)) def _logpdf(self, x, a, b): - return (special.xlogy(a - 1.0, x) - special.xlog1py(a + b, x) - + return (special.xlogy(a-1.0, x) - special.xlog1py(a+b, x) - special.betaln(a, b)) def _cdf(self, x, a, b): - return special.betainc(a, b, x / (1. + x)) - # remove for now: special.hyp2f1 is incorrect for large a - # x = where(x == 1.0, 1.0-1e-6, x) - # return pow(x, a)*special.hyp2f1(a+b, a, 1+a, -x)/a/special.beta(a, b) + return special.betainc(a, b, x/(1.+x)) def _fitstart(self, data): g1 = np.mean(data) @@ -593,26 +569,22 @@ class betaprime_gen(rv_continuous): def _munp(self, n, a, b): if (n == 1.0): - return where(b > 1, a / (b - 1.0), inf) + return where(b > 1, a/(b-1.0), inf) elif (n == 2.0): - return where(b > 2, a * (a + 1.0) / ((b - 2.0) * (b - 1.0)), inf) + return where(b > 2, a*(a+1.0)/((b-2.0)*(b-1.0)), inf) elif (n == 3.0): - return where(b > 3, - a * (a + 1.0) * (a + 2.0) / - ((b - 3.0) * (b - 2.0) * (b - 1.0)), + return where(b > 3, a*(a+1.0)*(a+2.0)/((b-3.0)*(b-2.0)*(b-1.0)), inf) elif (n == 4.0): return where(b > 4, - a * (a + 1.0) * (a + 2.0) * (a + 3.0) / - ((b - 4.0) * (b - 3.0) * (b - 2.0) * (b - 1.0)), - inf) + a*(a+1.0)*(a+2.0)*(a+3.0)/((b-4.0)*(b-3.0) + * (b-2.0)*(b-1.0)), inf) else: raise NotImplementedError betaprime = betaprime_gen(a=0.0, name='betaprime') class bradford_gen(rv_continuous): - """A Bradford continuous random variable. %(before_notes)s @@ -625,10 +597,13 @@ class bradford_gen(rv_continuous): for ``0 < x < 1``, ``c > 0`` and ``k = log(1+c)``. + `bradford` takes ``c`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _pdf(self, x, c): return c / (c * x + 1.0) / log1p(c) @@ -636,28 +611,26 @@ class bradford_gen(rv_continuous): return log1p(c * x) / log1p(c) def _ppf(self, q, c): - return ((1.0 + c) ** q - 1) / c + return ((1.0+c)**q-1)/c def _stats(self, c, moments='mv'): k = log1p(c) - mu = (c - k) / (c * k) - mu2 = ((c + 2.0) * k - 2.0 * c) / (2 * c * k * k) + mu = (c-k)/(c*k) + mu2 = ((c+2.0)*k-2.0*c)/(2*c*k*k) g1 = None g2 = None if 's' in moments: - g1 = (sqrt(2) * (12 * c * c - 9 * c * k * (c + 2) + - 2 * k * k * (c * (c + 3) + 3))) - g1 /= sqrt(c * (c * (k - 2) + 2 * k)) * (3 * c * (k - 2) + 6 * k) + g1 = sqrt(2)*(12*c*c-9*c*k*(c+2)+2*k*k*(c*(c+3)+3)) + g1 /= sqrt(c*(c*(k-2)+2*k))*(3*c*(k-2)+6*k) if 'k' in moments: - g2 = (c ** 3 * (k - 3) * (k * (3 * k - 16) + 24) + - 12 * k * c * c * (k - 4) * (k - 3) - + 6 * c * k * k * (3 * k - 14) + 12 * k ** 3) - g2 /= 3 * c * (c * (k - 2) + 2 * k) ** 2 + g2 = (c**3*(k-3)*(k*(3*k-16)+24)+12*k*c*c*(k-4)*(k-3) + + 6*c*k*k*(3*k-14) + 12*k**3) + g2 /= 3*c*(c*(k-2)+2*k)**2 return mu, mu2, g1, g2 def _entropy(self, c): k = log1p(c) - return k / 2.0 - log(c / k) + return k/2.0 - log(c/k) def _fitstart(self, data): loc = data.min() - 1e-4 @@ -671,14 +644,14 @@ bradford = bradford_gen(a=0.0, b=1.0, name='bradford') class burr_gen(rv_continuous): - - """A Burr continuous random variable. + """A Burr (Type III) continuous random variable. 
%(before_notes)s See Also -------- - fisk : a special case of `burr` with ``d = 1`` + fisk : a special case of either `burr` or ``burr12`` with ``d = 1`` + burr12 : Burr Type XII distribution Notes ----- @@ -688,19 +661,29 @@ class burr_gen(rv_continuous): for ``x > 0``. + `burr` takes ``c`` and ``d`` as shape parameters. + + This is the PDF corresponding to the third CDF given in Burr's list; + specifically, it is equation (11) in Burr's paper [1]_. + + %(after_notes)s + + References + ---------- + .. [1] Burr, I. W. "Cumulative frequency functions", Annals of + Mathematical Statistics, 13(2), pp 215-232 (1942). + %(example)s """ - def _pdf(self, x, c, d): - return c * d * (x ** (-c - 1.0)) * \ - ((1 + x ** (-c * 1.0)) ** (-d - 1.0)) + return c * d * (x**(-c - 1.0)) * ((1 + x**(-c))**(-d - 1.0)) def _cdf(self, x, c, d): - return (1 + x ** (-c * 1.0)) ** (-d ** 1.0) + return (1 + x**(-c))**(-d) def _ppf(self, q, c, d): - return (q ** (-1.0 / d) - 1) ** (-1.0 / c) + return (q**(-1.0/d) - 1)**(-1.0/c) def _munp(self, n, c, d): nc = 1. * n / c @@ -708,15 +691,93 @@ class burr_gen(rv_continuous): burr = burr_gen(a=0.0, name='burr') -class fisk_gen(burr_gen): +class burr12_gen(rv_continuous): + """A Burr (Type XII) continuous random variable. + + %(before_notes)s + + See Also + -------- + fisk : a special case of either `burr` or ``burr12`` with ``d = 1`` + burr : Burr Type III distribution + + Notes + ----- + The probability density function for `burr` is:: + + burr12.pdf(x, c, d) = c * d * x**(c-1) * (1+x**(c))**(-d-1) + + for ``x > 0``. + + `burr12` takes ``c`` and ``d`` as shape parameters. + + This is the PDF corresponding to the twelfth CDF given in Burr's list; + specifically, it is equation (20) in Burr's paper [1]_. + + %(after_notes)s + + The Burr type 12 distribution is also sometimes referred to as + the Singh-Maddala distribution from NIST [2]_. + References + ---------- + .. [1] Burr, I. W. "Cumulative frequency functions", Annals of + Mathematical Statistics, 13(2), pp 215-232 (1942). + + .. [2] http://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/b12pdf.htm + + %(example)s + + """ + def _pdf(self, x, c, d): + return np.exp(self._logpdf(x, c, d)) + + def _logpdf(self, x, c, d): + return log(c) + log(d) + special.xlogy(c-1, x) + special.xlog1py(-d-1, x**c) + + def _cdf(self, x, c, d): + return 1 - self._sf(x, c, d) + + def _logcdf(self, x, c, d): + return special.log1p(-(1 + x**c)**(-d)) + + def _sf(self, x, c, d): + return np.exp(self._logsf(x, c, d)) + + def _logsf(self, x, c, d): + return special.xlog1py(-d, x**c) + + def _ppf(self, q, c, d): + return ((1 - q)**(-1.0/d) - 1)**(1.0/c) + + def _munp(self, n, c, d): + nc = 1. * n / c + return d * special.beta(1.0 + nc, d - nc) +burr12 = burr12_gen(a=0.0, name='burr12') + + +class fisk_gen(burr_gen): """A Fisk continuous random variable. The Fisk distribution is also known as the log-logistic distribution, and equals the Burr distribution with ``d == 1``. + `fisk` takes ``c`` as a shape parameter. + %(before_notes)s + Notes + ----- + The probability density function for `fisk` is:: + + fisk.pdf(x, c) = c * x**(-c-1) * (1 + x**(-c))**(-2) + + for ``x > 0``. + + `fisk` takes ``c`` as a shape parameters. + + %(after_notes)s + See Also -------- burr @@ -724,7 +785,6 @@ class fisk_gen(burr_gen): %(example)s """ - def _pdf(self, x, c): return burr_gen._pdf(self, x, c, 1.0) @@ -744,7 +804,6 @@ fisk = fisk_gen(a=0.0, name='fisk') # median = loc class cauchy_gen(rv_continuous): - """A Cauchy continuous random variable. 
%(before_notes)s @@ -755,38 +814,40 @@ class cauchy_gen(rv_continuous): cauchy.pdf(x) = 1 / (pi * (1 + x**2)) + %(after_notes)s + %(example)s """ - def _pdf(self, x): - return 1.0 / pi / (1.0 + x * x) + return 1.0/pi/(1.0+x*x) def _cdf(self, x): - return 0.5 + 1.0 / pi * arctan(x) + return 0.5 + 1.0/pi*arctan(x) def _ppf(self, q): - return tan(pi * q - pi / 2.0) + return tan(pi*q-pi/2.0) def _sf(self, x): - return 0.5 - 1.0 / pi * arctan(x) + return 0.5 - 1.0/pi*arctan(x) def _isf(self, q): - return tan(pi / 2.0 - pi * q) + return tan(pi/2.0-pi*q) def _stats(self): - return inf, inf, nan, nan + return nan, nan, nan, nan def _entropy(self): - return log(4 * pi) + return log(4*pi) def _fitstart(self, data, args=None): - return (0, 1) + # Initialize ML guesses using quartiles instead of moments. + p25, p50, p75 = np.percentile(data, [25, 50, 75]) + return p50, (p75 - p25)/2 cauchy = cauchy_gen(name='cauchy') class chi_gen(rv_continuous): - """A chi continuous random variable. %(before_notes)s @@ -805,29 +866,36 @@ class chi_gen(rv_continuous): - ``chi(2, 0, scale)`` is equivalent to `rayleigh` - ``chi(3, 0, scale)`` is equivalent to `maxwell` + `chi` takes ``df`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _rvs(self, df): - return sqrt(chi2.rvs(df, size=self._size)) + sz, rndm = self._size, self._random_state + return sqrt(chi2.rvs(df, size=sz, random_state=rndm)) def _pdf(self, x, df): - return x ** (df - 1.) * exp(-x * x * 0.5) / \ - (2.0) ** (df * 0.5 - 1) / gam(df * 0.5) + return np.exp(self._logpdf(x, df)) + + def _logpdf(self, x, df): + l = np.log(2) - .5*np.log(2)*df - special.gammaln(.5*df) + return l + special.xlogy(df-1.,x) - .5*x**2 def _cdf(self, x, df): - return special.gammainc(df * 0.5, 0.5 * x * x) + return special.gammainc(.5*df, .5*x**2) def _ppf(self, q, df): - return sqrt(2 * special.gammaincinv(df * 0.5, q)) + return sqrt(2*special.gammaincinv(.5*df, q)) def _stats(self, df): - mu = sqrt(2) * special.gamma(df / 2.0 + 0.5) / special.gamma(df / 2.0) - mu2 = df - mu * mu - g1 = (2 * mu ** 3.0 + mu * (1 - 2 * df)) / asarray(np.power(mu2, 1.5)) - g2 = 2 * df * (1.0 - df) - 6 * mu ** 4 + 4 * mu ** 2 * (2 * df - 1) - g2 /= asarray(mu2 ** 2.0) + mu = sqrt(2)*special.gamma(df/2.0+0.5)/special.gamma(df/2.0) + mu2 = df - mu*mu + g1 = (2*mu**3.0 + mu*(1-2*df))/asarray(np.power(mu2, 1.5)) + g2 = 2*df*(1.0-df)-6*mu**4 + 4*mu**2 * (2*df-1) + g2 /= asarray(mu2**2.0) return mu, mu2, g1, g2 def _fitstart(self, data): @@ -839,9 +907,8 @@ class chi_gen(rv_continuous): chi = chi_gen(a=0.0, name='chi') -# Chi-squared (gamma-distributed with loc=0 and scale=2 and shape=df/2) +## Chi-squared (gamma-distributed with loc=0 and scale=2 and shape=df/2) class chi2_gen(rv_continuous): - """A chi-squared continuous random variable. %(before_notes)s @@ -852,19 +919,21 @@ class chi2_gen(rv_continuous): chi2.pdf(x, df) = 1 / (2*gamma(df/2)) * (x/2)**(df/2-1) * exp(-x/2) + `chi2` takes ``df`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _rvs(self, df): - return mtrand.chisquare(df, self._size) + return self._random_state.chisquare(df, self._size) def _pdf(self, x, df): return exp(self._logpdf(x, df)) def _logpdf(self, x, df): - return special.xlogy(df / 2. - 1, x) - x / 2. - \ - gamln(df / 2.) - (log(2) * df) / 2. + return special.xlogy(df/2.-1, x) - x/2. - gamln(df/2.) - (log(2)*df)/2. 
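The new ``cauchy._fitstart`` above seeds maximum likelihood with quartiles because the Cauchy law has no finite moments (its ``_stats`` now return ``nan``): the sample median estimates ``loc``, and half the interquartile range estimates ``scale``, since ``ppf(0.75) - ppf(0.25) == 2`` for the standard Cauchy. A sketch of feeding those starting values to ``fit``::

    import numpy as np
    from scipy.stats import cauchy

    rng = np.random.RandomState(42)
    data = cauchy.rvs(loc=5.0, scale=2.0, size=1000, random_state=rng)

    # Quartile-based starting values, as in _fitstart above.
    p25, p50, p75 = np.percentile(data, [25, 50, 75])
    loc, scale = cauchy.fit(data, loc=p50, scale=(p75 - p25) / 2)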
def _cdf(self, x, df): return special.chdtr(df, x) @@ -876,13 +945,13 @@ class chi2_gen(rv_continuous): return special.chdtri(df, p) def _ppf(self, p, df): - return self._isf(1.0 - p, df) + return self._isf(1.0-p, df) def _stats(self, df): mu = df - mu2 = 2 * df - g1 = 2 * sqrt(2.0 / df) - g2 = 12.0 / df + mu2 = 2*df + g1 = 2*sqrt(2.0/df) + g2 = 12.0/df return mu, mu2, g1, g2 def _fitstart(self, data): @@ -895,7 +964,6 @@ chi2 = chi2_gen(a=0.0, name='chi2') class cosine_gen(rv_continuous): - """A cosine continuous random variable. %(before_notes)s @@ -909,27 +977,26 @@ class cosine_gen(rv_continuous): for ``-pi <= x <= pi``. + %(after_notes)s + %(example)s """ - def _pdf(self, x): - return 1.0 / 2 / pi * (1 + cos(x)) + return 1.0/2/pi*(1+cos(x)) def _cdf(self, x): - return 1.0 / 2 / pi * (pi + x + sin(x)) + return 1.0/2/pi*(pi + x + sin(x)) def _stats(self): - return 0.0, pi * pi / 3.0 - 2.0, 0.0, -6.0 * \ - (pi ** 4 - 90) / (5.0 * (pi * pi - 6) ** 2) + return 0.0, pi*pi/3.0-2.0, 0.0, -6.0*(pi**4-90)/(5.0*(pi*pi-6)**2) def _entropy(self): - return log(4 * pi) - 1.0 + return log(4*pi)-1.0 cosine = cosine_gen(a=-pi, b=pi, name='cosine') class dgamma_gen(rv_continuous): - """A double gamma continuous random variable. %(before_notes)s @@ -942,42 +1009,46 @@ class dgamma_gen(rv_continuous): for ``a > 0``. + `dgamma` takes ``a`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _rvs(self, a): - u = mtrand.random_sample(size=self._size) - return (gamma.rvs(a, size=self._size) * where(u >= 0.5, 1, -1)) + sz, rndm = self._size, self._random_state + u = rndm.random_sample(size=sz) + gm = gamma.rvs(a, size=sz, random_state=rndm) + return gm * where(u >= 0.5, 1, -1) def _pdf(self, x, a): ax = abs(x) - return 1.0 / (2 * special.gamma(a)) * ax ** (a - 1.0) * exp(-ax) + return 1.0/(2*special.gamma(a))*ax**(a-1.0) * exp(-ax) def _logpdf(self, x, a): ax = abs(x) - return special.xlogy(a - 1.0, ax) - ax - log(2) - gamln(a) + return special.xlogy(a-1.0, ax) - ax - log(2) - gamln(a) def _cdf(self, x, a): - fac = 0.5 * special.gammainc(a, abs(x)) + fac = 0.5*special.gammainc(a, abs(x)) return where(x > 0, 0.5 + fac, 0.5 - fac) def _sf(self, x, a): - fac = 0.5 * special.gammainc(a, abs(x)) - return where(x > 0, 0.5 - fac, 0.5 + fac) + fac = 0.5*special.gammainc(a, abs(x)) + return where(x > 0, 0.5-fac, 0.5+fac) def _ppf(self, q, a): - fac = special.gammainccinv(a, 1 - abs(2 * q - 1)) + fac = special.gammainccinv(a, 1-abs(2*q-1)) return where(q > 0.5, fac, -fac) def _stats(self, a): - mu2 = a * (a + 1.0) - return 0.0, mu2, 0.0, (a + 2.0) * (a + 3.0) / mu2 - 3.0 + mu2 = a*(a+1.0) + return 0.0, mu2, 0.0, (a+2.0)*(a+3.0)/mu2-3.0 dgamma = dgamma_gen(name='dgamma') class dweibull_gen(rv_continuous): - """A double Weibull continuous random variable. %(before_notes)s @@ -988,25 +1059,30 @@ class dweibull_gen(rv_continuous): dweibull.pdf(x, c) = c / 2 * abs(x)**(c-1) * exp(-abs(x)**c) + `dweibull` takes ``d`` as a shape parameter. 
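``dgamma._rvs`` and ``dweibull._rvs`` above both build a symmetric variate by drawing from the one-sided law and flipping its sign with probability one half (the single shape parameter is the ``c`` passed to the methods). A sketch of that construction::

    import numpy as np
    from scipy.stats import weibull_min

    rng = np.random.RandomState(7)
    c, n = 2.5, 100000

    u = rng.random_sample(n)
    w = weibull_min.rvs(c, size=n, random_state=rng)
    samples = w * np.where(u >= 0.5, 1, -1)   # double Weibull draw
    print(samples.mean())                     # ~0 by symmetry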
+ + %(after_notes)s + %(example)s """ - def _rvs(self, c): - u = mtrand.random_sample(size=self._size) - return weibull_min.rvs(c, size=self._size) * (where(u >= 0.5, 1, -1)) + sz, rndm = self._size, self._random_state + u = rndm.random_sample(size=sz) + w = weibull_min.rvs(c, size=sz, random_state=rndm) + return w * (where(u >= 0.5, 1, -1)) def _pdf(self, x, c): ax = abs(x) - Px = c / 2.0 * ax ** (c - 1.0) * exp(-ax ** c) + Px = c / 2.0 * ax**(c-1.0) * exp(-ax**c) return Px def _logpdf(self, x, c): ax = abs(x) - return log(c) - log(2.0) + special.xlogy(c - 1.0, ax) - ax ** c + return log(c) - log(2.0) + special.xlogy(c - 1.0, ax) - ax**c def _cdf(self, x, c): - Cx1 = 0.5 * exp(-abs(x) ** c) + Cx1 = 0.5 * exp(-abs(x)**c) return where(x > 0, 1 - Cx1, Cx1) def _ppf(self, q, c): @@ -1025,9 +1101,8 @@ class dweibull_gen(rv_continuous): dweibull = dweibull_gen(name='dweibull') -# Exponential (gamma distributed with a=1.0, loc=loc and scale=scale) +## Exponential (gamma distributed with a=1.0, loc=loc and scale=scale) class expon_gen(rv_continuous): - """An exponential continuous random variable. %(before_notes)s @@ -1036,13 +1111,15 @@ class expon_gen(rv_continuous): ----- The probability density function for `expon` is:: - expon.pdf(x) = lambda * exp(- lambda*x) + expon.pdf(x) = exp(-x) for ``x >= 0``. - The scale parameter is equal to ``scale = 1.0 / lambda``. + %(after_notes)s - `expon` does not have shape parameters. + A common parameterization for `expon` is in terms of the rate parameter + ``lambda``, such that ``pdf = lambda * exp(-lambda * x)``. This + parameterization corresponds to using ``scale = 1 / lambda``. %(example)s @@ -1057,7 +1134,7 @@ class expon_gen(rv_continuous): raise IndexError('Index to the fixed parameter is out of bounds') def _rvs(self): - return mtrand.standard_exponential(self._size) + return self._random_state.standard_exponential(self._size) def _pdf(self, x): return exp(-x) @@ -1066,10 +1143,10 @@ class expon_gen(rv_continuous): return -x def _cdf(self, x): - return -expm1(-x) + return -special.expm1(-x) def _ppf(self, q): - return -log1p(-q) + return -special.log1p(-q) def _sf(self, x): return exp(-x) @@ -1088,8 +1165,79 @@ class expon_gen(rv_continuous): expon = expon_gen(a=0.0, name='expon') -class exponweib_gen(rv_continuous): +## Exponentially Modified Normal (exponential distribution +## convolved with a Normal). +## This is called an exponentially modified gaussian on wikipedia +class exponnorm_gen(rv_continuous): + """An exponentially modified Normal continuous random variable. + + %(before_notes)s + Notes + ----- + The probability density function for `exponnorm` is:: + + exponnorm.pdf(x, K) = 1/(2*K) exp(1/(2 * K**2)) exp(-x / K) * erfc(-(x - 1/K) / sqrt(2)) + + where the shape parameter ``K > 0``. + + It can be thought of as the sum of a normally distributed random + value with mean ``loc`` and sigma ``scale`` and an exponentially + distributed random number with a pdf proportional to ``exp(-lambda * x)`` + where ``lambda = (K * scale)**(-1)``. + + %(after_notes)s + + An alternative parameterization of this distribution (for example, in + `Wikipedia `_) + involves three parameters, :math:`\mu`, :math:`\lambda` and :math:`\sigma`. + In the present parameterization this corresponds to having ``loc`` and + ``scale`` equal to :math:`\mu` and :math:`\sigma`, respectively, and + shape parameter :math:`K = 1/\sigma\lambda`. + + .. 
versionadded:: 0.16.0 + + %(example)s + + """ + def _rvs(self, K): + expval = self._random_state.standard_exponential(self._size) * K + gval = self._random_state.standard_normal(self._size) + return expval + gval + + def _pdf(self, x, K): + invK = 1.0 / K + exparg = 0.5 * invK**2 - invK * x + # Avoid overflows; setting exp(exparg) to the max float works + # all right here + expval = _lazywhere(exparg < _LOGXMAX, (exparg,), exp, _XMAX) + return 0.5 * invK * expval * erfc(-(x - invK) / sqrt(2)) + + def _logpdf(self, x, K): + invK = 1.0 / K + exparg = 0.5 * invK**2 - invK * x + return exparg + log(0.5 * invK * erfc(-(x - invK) / sqrt(2))) + + def _cdf(self, x, K): + invK = 1.0 / K + expval = invK * (0.5 * invK - x) + return special.ndtr(x) - exp(expval) * special.ndtr(x - invK) + + def _sf(self, x, K): + invK = 1.0 / K + expval = invK * (0.5 * invK - x) + return special.ndtr(-x) + exp(expval) * special.ndtr(x - invK) + + def _stats(self, K): + K2 = K * K + opK2 = 1.0 + K2 + skw = 2 * K**3 * opK2**(-1.5) + krt = 6.0 * K2 * K2 * opK2**(-2) + return K, opK2, skw, krt +exponnorm = exponnorm_gen(name='exponnorm') + + +class exponweib_gen(rv_continuous): """An exponentiated Weibull continuous random variable. %(before_notes)s @@ -1103,31 +1251,33 @@ class exponweib_gen(rv_continuous): for ``x > 0``, ``a > 0``, ``c > 0``. + `exponweib` takes ``a`` and ``c`` as shape parameters. + + %(after_notes)s + %(example)s """ - def _pdf(self, x, a, c): return exp(self._logpdf(x, a, c)) def _logpdf(self, x, a, c): - negxc = -x ** c + negxc = -x**c exm1c = -special.expm1(negxc) logp = (log(a) + log(c) + special.xlogy(a - 1.0, exm1c) + negxc + special.xlogy(c - 1.0, x)) return logp def _cdf(self, x, a, c): - exm1c = -expm1(-x ** c) - return (exm1c) ** a + exm1c = -special.expm1(-x**c) + return exm1c**a def _ppf(self, q, a, c): - return (-log1p(-q ** (1.0 / a))) ** asarray(1.0 / c) + return (-special.log1p(-q**(1.0/a)))**asarray(1.0/c) exponweib = exponweib_gen(a=0.0, name='exponweib') class exponpow_gen(rv_continuous): - """An exponential power continuous random variable. %(before_notes)s @@ -1142,6 +1292,10 @@ class exponpow_gen(rv_continuous): from the exponential power distribution that is also known under the names "generalized normal" or "generalized Gaussian". + `exponpow` takes ``b`` as a shape parameter. + + %(after_notes)s + References ---------- http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Exponentialpower.pdf @@ -1149,31 +1303,29 @@ class exponpow_gen(rv_continuous): %(example)s """ - def _pdf(self, x, b): return exp(self._logpdf(x, b)) def _logpdf(self, x, b): - xb = x ** b + xb = x**b f = 1 + log(b) + special.xlogy(b - 1.0, x) + xb - exp(xb) return f def _cdf(self, x, b): - return -expm1(-expm1(x ** b)) + return -special.expm1(-special.expm1(x**b)) def _sf(self, x, b): - return exp(-expm1(x ** b)) + return exp(-special.expm1(x**b)) def _isf(self, x, b): - return (log1p(-log(x))) ** (1. / b) + return (special.log1p(-log(x)))**(1./b) def _ppf(self, q, b): - return pow(log1p(-log1p(-q)), 1.0 / b) + return pow(special.log1p(-special.log1p(-q)), 1.0/b) exponpow = exponpow_gen(a=0.0, name='exponpow') class fatiguelife_gen(rv_continuous): - """A fatigue-life (Birnbaum-Saunders) continuous random variable. %(before_notes)s @@ -1187,6 +1339,10 @@ class fatiguelife_gen(rv_continuous): for ``x > 0``. + `fatiguelife` takes ``c`` as a shape parameter. + + %(after_notes)s + References ---------- .. 
[1] "Birnbaum-Saunders distribution", @@ -1195,27 +1351,26 @@ class fatiguelife_gen(rv_continuous): %(example)s """ - def _rvs(self, c): - z = mtrand.standard_normal(self._size) - x = 0.5 * c * z - x2 = x * x - t = 1.0 + 2 * x2 + 2 * x * sqrt(1 + x2) + z = self._random_state.standard_normal(self._size) + x = 0.5*c*z + x2 = x*x + t = 1.0 + 2*x2 + 2*x*sqrt(1 + x2) return t def _pdf(self, x, c): return np.exp(self._logpdf(x, c)) def _logpdf(self, x, c): - return (log(x + 1) - (x - 1) ** 2 / (2.0 * x * c ** 2) - log(2 * c) - - 0.5 * (log(2 * pi) + 3 * log(x))) + return (log(x+1) - (x-1)**2 / (2.0*x*c**2) - log(2*c) - + 0.5*(log(2*pi) + 3*log(x))) def _cdf(self, x, c): - return special.ndtr(1.0 / c * (sqrt(x) - 1.0 / sqrt(x))) + return special.ndtr(1.0 / c * (sqrt(x) - 1.0/sqrt(x))) def _ppf(self, q, c): - tmp = c * special.ndtri(q) - return 0.25 * (tmp + sqrt(tmp ** 2 + 4)) ** 2 + tmp = c*special.ndtri(q) + return 0.25 * (tmp + sqrt(tmp**2 + 4))**2 def _stats(self, c): # NB: the formula for kurtosis in wikipedia seems to have an error: @@ -1223,18 +1378,17 @@ class fatiguelife_gen(rv_continuous): # Alpha. And the latter one, below, passes the tests, while the wiki # one doesn't So far I didn't have the guts to actually check the # coefficients from the expressions for the raw moments. - c2 = c * c + c2 = c*c mu = c2 / 2.0 + 1.0 den = 5.0 * c2 + 4.0 - mu2 = c2 * den / 4.0 - g1 = 4 * c * (11 * c2 + 6.0) / np.power(den, 1.5) - g2 = 6 * c2 * (93 * c2 + 40.0) / den ** 2.0 + mu2 = c2*den / 4.0 + g1 = 4 * c * (11*c2 + 6.0) / np.power(den, 1.5) + g2 = 6 * c2 * (93*c2 + 40.0) / den**2.0 return mu, mu2, g1, g2 fatiguelife = fatiguelife_gen(a=0.0, name='fatiguelife') class foldcauchy_gen(rv_continuous): - """A folded Cauchy continuous random variable. %(before_notes)s @@ -1247,18 +1401,20 @@ class foldcauchy_gen(rv_continuous): for ``x >= 0``. + `foldcauchy` takes ``c`` as a shape parameter. + %(example)s """ - def _rvs(self, c): - return abs(cauchy.rvs(loc=c, size=self._size)) + return abs(cauchy.rvs(loc=c, size=self._size, + random_state=self._random_state)) def _pdf(self, x, c): - return 1.0 / pi * (1.0 / (1 + (x - c) ** 2) + 1.0 / (1 + (x + c) ** 2)) + return 1.0/pi*(1.0/(1+(x-c)**2) + 1.0/(1+(x+c)**2)) def _cdf(self, x, c): - return 1.0 / pi * (arctan(x - c) + arctan(x + c)) + return 1.0/pi*(arctan(x-c) + arctan(x+c)) def _stats(self, c): return inf, inf, nan, nan @@ -1266,7 +1422,6 @@ foldcauchy = foldcauchy_gen(a=0.0, name='foldcauchy') class f_gen(rv_continuous): - """An F continuous random variable. %(before_notes)s @@ -1281,12 +1436,15 @@ class f_gen(rv_continuous): for ``x > 0``. + `f` takes ``dfn`` and ``dfd`` as shape parameters. 
+ + %(after_notes)s + %(example)s """ - def _rvs(self, dfn, dfd): - return mtrand.f(dfn, dfd, self._size) + return self._random_state.f(dfn, dfd, self._size) def _pdf(self, x, dfn, dfd): return exp(self._logpdf(x, dfn, dfd)) @@ -1294,8 +1452,8 @@ class f_gen(rv_continuous): def _logpdf(self, x, dfn, dfd): n = 1.0 * dfn m = 1.0 * dfd - lPx = m / 2 * log(m) + n / 2 * log(n) + (n / 2 - 1) * log(x) - lPx -= ((n + m) / 2) * log(m + n * x) + special.betaln(n / 2, m / 2) + lPx = m/2 * log(m) + n/2 * log(n) + (n/2 - 1) * log(x) + lPx -= ((n+m)/2) * log(m + n*x) + special.betaln(n/2, m/2) return lPx def _cdf(self, x, dfn, dfd): @@ -1319,7 +1477,7 @@ class f_gen(rv_continuous): mu2 = _lazywhere( v2 > 4, (v1, v2, v2_2, v2_4), lambda v1, v2, v2_2, v2_4: - 2 * v2 * v2 * (v1 + v2_2) / (v1 * v2_2 ** 2 * v2_4), + 2 * v2 * v2 * (v1 + v2_2) / (v1 * v2_2**2 * v2_4), np.inf) g1 = _lazywhere( @@ -1349,16 +1507,15 @@ class f_gen(rv_continuous): f = f_gen(a=0.0, name='f') -# Folded Normal -# abs(Z) where (Z is normal with mu=L and std=S so that c=abs(L)/S) +## Folded Normal +## abs(Z) where (Z is normal with mu=L and std=S so that c=abs(L)/S) ## -# note: regress docs have scale parameter correct, but first parameter -# he gives is a shape parameter A = c * scale +## note: regress docs have scale parameter correct, but first parameter +## he gives is a shape parameter A = c * scale -# Half-normal is folded normal with shape-parameter c=0. +## Half-normal is folded normal with shape-parameter c=0. class foldnorm_gen(rv_continuous): - """A folded normal continuous random variable. %(before_notes)s @@ -1371,49 +1528,51 @@ class foldnorm_gen(rv_continuous): for ``c >= 0``. + `foldnorm` takes ``c`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _argcheck(self, c): return (c >= 0) def _rvs(self, c): - return abs(mtrand.standard_normal(self._size) + c) + return abs(self._random_state.standard_normal(self._size) + c) def _pdf(self, x, c): - return _norm_pdf(x + c) + _norm_pdf(x - c) + return _norm_pdf(x + c) + _norm_pdf(x-c) def _cdf(self, x, c): - return special.ndtr(x - c) + special.ndtr(x + c) - 1.0 + return special.ndtr(x-c) + special.ndtr(x+c) - 1.0 def _stats(self, c): # Regina C. Elandt, Technometrics 3, 551 (1961) # http://www.jstor.org/stable/1266561 # - c2 = c * c - expfac = np.exp(-0.5 * c2) / np.sqrt(2. * pi) + c2 = c*c + expfac = np.exp(-0.5*c2) / np.sqrt(2.*pi) - mu = 2. * expfac + c * special.erf(c / sqrt(2)) - mu2 = c2 + 1 - mu * mu + mu = 2.*expfac + c * special.erf(c/sqrt(2)) + mu2 = c2 + 1 - mu*mu - g1 = 2. * (mu * mu * mu - c2 * mu - expfac) + g1 = 2. * (mu*mu*mu - c2*mu - expfac) g1 /= np.power(mu2, 1.5) - g2 = c2 * (c2 + 6.) + 3 + 8. * expfac * mu - g2 += (2. * (c2 - 3.) - 3. * mu ** 2) * mu ** 2 - g2 = g2 / mu2 ** 2.0 - 3. + g2 = c2 * (c2 + 6.) + 3 + 8.*expfac*mu + g2 += (2. * (c2 - 3.) - 3. * mu**2) * mu**2 + g2 = g2 / mu2**2.0 - 3. return mu, mu2, g1, g2 foldnorm = foldnorm_gen(a=0.0, name='foldnorm') -# Extreme Value Type II or Frechet -# (defined in Regress+ documentation as Extreme LB) as -# a limiting value distribution. +## Extreme Value Type II or Frechet +## (defined in Regress+ documentation as Extreme LB) as +## a limiting value distribution. ## class frechet_r_gen(rv_continuous): - """A Frechet right (or Weibull minimum) continuous random variable. %(before_notes)s @@ -1431,10 +1590,13 @@ class frechet_r_gen(rv_continuous): for ``x > 0``, ``c > 0``. + `frechet_r` takes ``c`` as a shape parameter. 
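As the comment block above notes, the half-normal law is the folded normal with shape ``c = 0``; a quick numerical confirmation (editorial aside, not part of the patch)::

    >>> import numpy as np
    >>> from scipy.stats import foldnorm, halfnorm
    >>> x = np.linspace(0.1, 3, 5)
    >>> np.allclose(foldnorm.pdf(x, 0.0), halfnorm.pdf(x))
    True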
+ + %(after_notes)s + %(example)s """ - def _link(self, x, logSF, phat, ix): if ix == 0: phati = log(-logSF) / log((x - phat[1]) / phat[2]) @@ -1447,19 +1609,19 @@ class frechet_r_gen(rv_continuous): return phati def _pdf(self, x, c): - return c * pow(x, c - 1) * exp(-pow(x, c)) + return c*pow(x, c-1)*exp(-pow(x, c)) def _logpdf(self, x, c): return log(c) + special.xlogy(c - 1, x) - pow(x, c) def _cdf(self, x, c): - return -expm1(-pow(x, c)) + return -special.expm1(-pow(x, c)) def _ppf(self, q, c): - return pow(-log1p(-q), 1.0 / c) + return pow(-special.log1p(-q), 1.0/c) def _munp(self, n, c): - return special.gamma(1.0 + n * 1.0 / c) + return special.gamma(1.0+n*1.0/c) def _entropy(self, c): return -_EULER / c - log(c) + _EULER + 1 @@ -1475,7 +1637,6 @@ weibull_min = frechet_r_gen(a=0.0, name='weibull_min') class frechet_l_gen(rv_continuous): - """A Frechet left (or Weibull maximum) continuous random variable. %(before_notes)s @@ -1493,21 +1654,24 @@ class frechet_l_gen(rv_continuous): for ``x < 0``, ``c > 0``. + `frechet_l` takes ``c`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _pdf(self, x, c): - return c * pow(-x, c - 1) * exp(-pow(-x, c)) + return c*pow(-x, c-1)*exp(-pow(-x, c)) def _cdf(self, x, c): return exp(-pow(-x, c)) def _ppf(self, q, c): - return -pow(-log(q), 1.0 / c) + return -pow(-log(q), 1.0/c) def _munp(self, n, c): - val = special.gamma(1.0 + n * 1.0 / c) + val = special.gamma(1.0+n*1.0/c) if (int(n) % 2): sgn = -1 else: @@ -1527,7 +1691,6 @@ weibull_max = frechet_l_gen(b=0.0, name='weibull_max') class genlogistic_gen(rv_continuous): - """A generalized logistic continuous random variable. %(before_notes)s @@ -1540,32 +1703,35 @@ class genlogistic_gen(rv_continuous): for ``x > 0``, ``c > 0``. + `genlogistic` takes ``c`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _pdf(self, x, c): return exp(self._logpdf(x, c)) def _logpdf(self, x, c): - return log(c) - x - (c + 1.0) * log1p(exp(-x)) + return log(c) - x - (c+1.0)*special.log1p(exp(-x)) def _cdf(self, x, c): - Cx = (1 + exp(-x)) ** (-c) + Cx = (1+exp(-x))**(-c) return Cx def _ppf(self, q, c): - vals = -log(pow(q, -1.0 / c) - 1) + vals = -log(pow(q, -1.0/c)-1) return vals def _stats(self, c): zeta = special.zeta mu = _EULER + special.psi(c) - mu2 = pi * pi / 6.0 + zeta(2, c) - g1 = -2 * zeta(3, c) + 2 * _ZETA3 + mu2 = pi*pi/6.0 + zeta(2, c) + g1 = -2*zeta(3, c) + 2*_ZETA3 g1 /= np.power(mu2, 1.5) - g2 = pi ** 4 / 15.0 + 6 * zeta(4, c) - g2 /= mu2 ** 2.0 + g2 = pi**4/15.0 + 6*zeta(4, c) + g2 /= mu2**2.0 return mu, mu2, g1, g2 genlogistic = genlogistic_gen(name='genlogistic') @@ -1579,7 +1745,6 @@ def log1pxdx(x): class genpareto_gen(rv_continuous): - """A generalized Pareto continuous random variable. %(before_notes)s @@ -1593,6 +1758,8 @@ class genpareto_gen(rv_continuous): defined for ``x >= 0`` if ``c >=0``, and for ``0 <= x <= -1/c`` if ``c < 0``. + `genpareto` takes ``c`` as a shape parameter. 
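For reference, `genlogistic` with ``c == 1`` collapses to the ordinary logistic distribution, which makes a convenient spot check of the ``_pdf``/``_logpdf`` pair above (editorial aside)::

    >>> import numpy as np
    >>> from scipy.stats import genlogistic, logistic
    >>> x = np.linspace(-3, 3, 7)
    >>> np.allclose(genlogistic.pdf(x, c=1), logistic.pdf(x))
    True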
+ For ``c == 0``, `genpareto` reduces to the exponential distribution, `expon`:: @@ -1602,6 +1769,8 @@ class genpareto_gen(rv_continuous): genpareto.cdf(x, c=-1) = x + %(after_notes)s + %(example)s """ @@ -1639,23 +1808,23 @@ class genpareto_gen(rv_continuous): return where(abs(c) == inf, False, True) def _pdf(self, x, c): - return exp(self._logpdf(x, c)) + return np.exp(self._logpdf(x, c)) def _logpdf(self, x, c): return _lazywhere((x == x) & (c != 0), (x, c), - lambda x, c: -special.xlog1py(c + 1., c * x) / c, - -x) + lambda x, c: -special.xlog1py(c+1., c*x) / c, + -x) def _cdf(self, x, c): - return - expm1(self._logsf(x, c)) + return -inv_boxcox1p(-x, -c) def _sf(self, x, c): - return exp(self._logsf(x, c)) + return inv_boxcox(-x, -c) def _logsf(self, x, c): return _lazywhere((x == x) & (c != 0), (x, c), - lambda x, c: -log1p(c * x) / c, - -x) + lambda x, c: -special.log1p(c*x) / c, + -x) def _ppf(self, q, c): return -boxcox1p(-q, -c) @@ -1756,8 +1925,8 @@ class genpareto_gen(rv_continuous): val = val + cnk * (-1) ** ki / (1.0 - c * ki) return where(c * n < 1, val * (-1.0 / c) ** n, inf) return _lazywhere(c != 0, (c,), - lambda c: __munp(n, c), - gam(n + 1)) + lambda c: __munp(n, c), + gam(n + 1)) def _entropy(self, c): return 1. + c @@ -1765,7 +1934,6 @@ genpareto = genpareto_gen(a=0.0, name='genpareto') class genexpon_gen(rv_continuous): - """A generalized exponential continuous random variable. %(before_notes)s @@ -1779,6 +1947,10 @@ class genexpon_gen(rv_continuous): for ``x >= 0``, ``a, b, c > 0``. + `genexpon` takes ``a``, ``b`` and ``c`` as shape parameters. + + %(after_notes)s + References ---------- H.K. Ryu, "An Extension of Marshall and Olkin's Bivariate Exponential @@ -1807,20 +1979,19 @@ class genexpon_gen(rv_continuous): return phati def _pdf(self, x, a, b, c): - return (a + b * (-expm1(-c * x))) * exp((-a - b) * x + - b * (-expm1(-c * x)) / c) + return (a + b*(-special.expm1(-c*x)))*exp((-a-b)*x + + b*(-special.expm1(-c*x))/c) def _cdf(self, x, a, b, c): - return -expm1((-a - b) * x + b * (-expm1(-c * x)) / c) + return -special.expm1((-a-b)*x + b*(-special.expm1(-c*x))/c) def _logpdf(self, x, a, b, c): - return (np.log(a + b * (-expm1(-c * x))) + (-a - b) * x + - b * (-expm1(-c * x)) / c) + return np.log(a+b*(-special.expm1(-c*x))) + \ + (-a-b)*x+b*(-special.expm1(-c*x))/c genexpon = genexpon_gen(a=0.0, name='genexpon') class genextreme_gen(rv_continuous): - """A generalized extreme value continuous random variable. %(before_notes)s @@ -1841,13 +2012,16 @@ class genextreme_gen(rv_continuous): Note that several sources and software packages use the opposite convention for the sign of the shape parameter ``c``. + `genextreme` takes ``c`` as a shape parameter. 
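The rewritten ``_cdf``/``_sf`` above lean on ``inv_boxcox1p``/``inv_boxcox`` from scipy.special (available from scipy 0.16), since the GPD cdf ``1 - (1 + c*x)**(-1/c)`` is an inverse Box-Cox transform in disguise; the ``c == 0`` limit is the exponential law quoted in the docstring. A sketch verifying both claims::

    >>> import numpy as np
    >>> from scipy.stats import genpareto, expon
    >>> from scipy.special import inv_boxcox1p
    >>> x, c = 1.5, 0.25
    >>> np.allclose(genpareto.cdf(x, c), -inv_boxcox1p(-x, -c))
    True
    >>> xs = np.linspace(0, 4, 9)
    >>> np.allclose(genpareto.cdf(xs, c=0), expon.cdf(xs))
    True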
+ + %(after_notes)s + %(example)s """ - def _argcheck(self, c): - min = np.minimum # @ReservedAssignment - max = np.maximum # @ReservedAssignment + min = np.minimum + max = np.maximum self.b = where(c > 0, 1.0 / max(c, _XMIN), inf) self.a = where(c < 0, 1.0 / min(c, -_XMIN), -inf) return where(abs(c) == inf, 0, 1) @@ -1888,40 +2062,40 @@ class genextreme_gen(rv_continuous): lambda x, c: -expm1(-c * x) / c, x) def _stats(self, c): - g = lambda n: gam(n * c + 1) + g = lambda n: gam(n*c+1) g1 = g(1) g2 = g(2) g3 = g(3) g4 = g(4) - g2mg12 = where(abs(c) < 1e-7, (c * pi) ** 2.0 / 6.0, g2 - g1 ** 2.0) - gam2k = where(abs(c) < 1e-7, pi ** 2.0 / 6.0, - expm1(gamln(2.0 * c + 1.0) - - 2 * gamln(c + 1.0)) / c ** 2.0) + g2mg12 = where(abs(c) < 1e-7, (c*pi)**2.0/6.0, g2-g1**2.0) + gam2k = where(abs(c) < 1e-7, pi**2.0/6.0, + special.expm1(gamln(2.0*c+1.0)-2*gamln(c+1.0))/c**2.0) eps = 1e-14 - gamk = where(abs(c) < eps, -_EULER, expm1(gamln(c + 1)) / c) + gamk = where(abs(c) < eps, -_EULER, special.expm1(gamln(c+1))/c) m = where(c < -1.0, nan, -gamk) - v = where(c < -0.5, nan, g1 ** 2.0 * gam2k) + v = where(c < -0.5, nan, g1**2.0*gam2k) # skewness - sk1 = where(c < -1. / 3, nan, - np.sign(c) * (-g3 + (g2 + 2 * g2mg12) * g1) / - ((g2mg12) ** (3. / 2.))) - sk = where(abs(c) <= eps ** 0.29, 12 * sqrt(6) * _ZETA3 / pi ** 3, sk1) + sk1 = where(c < -1./3, nan, + np.sign(c)*(-g3+(g2+2*g2mg12)*g1)/((g2mg12)**(3./2.))) + sk = where(abs(c) <= eps**0.29, 12*sqrt(6)*_ZETA3/pi**3, sk1) # kurtosis - ku1 = where(c < -1. / 4, nan, - (g4 + (-4 * g3 + 3 * (g2 + g2mg12) * g1) * g1) / - ((g2mg12) ** 2)) - ku = where(abs(c) <= (eps) ** 0.23, 12.0 / 5.0, ku1 - 3.0) + ku1 = where(c < -1./4, nan, + (g4+(-4*g3+3*(g2+g2mg12)*g1)*g1)/((g2mg12)**2)) + ku = where(abs(c) <= (eps)**0.23, 12.0/5.0, ku1-3.0) return m, v, sk, ku def _munp(self, n, c): - k = arange(0, n + 1) - vals = 1.0 / c ** n * sum( - comb(n, k) * (-1) ** k * special.gamma(c * k + 1), + k = arange(0, n+1) + vals = 1.0/c**n * np.sum( + comb(n, k) * (-1)**k * special.gamma(c*k + 1), axis=0) - return where(c * n > -1, vals, inf) + return where(c*n > -1, vals, inf) + + def _entropy(self, c): + return _EULER*(1 - c) + 1 def _fitstart(self, data): d = asarray(data) @@ -1967,26 +2141,25 @@ def _digammainv(y): value = optimize.newton(func, x0, tol=1e-10) return value elif y > -3: - x0 = exp(y / 2.332) + 0.08661 + x0 = exp(y/2.332) + 0.08661 else: x0 = 1.0 / (-y - _em) - value, _info, ier, _msg = optimize.fsolve(func, x0, xtol=1e-11, - full_output=True) + value, info, ier, mesg = optimize.fsolve(func, x0, xtol=1e-11, + full_output=True) if ier != 1: raise RuntimeError("_digammainv: fsolve failed, y = %r" % y) return value[0] -# Gamma (Use MATLAB and MATHEMATICA (b=theta=scale, a=alpha=shape) definition) +## Gamma (Use MATLAB and MATHEMATICA (b=theta=scale, a=alpha=shape) definition) -# gamma(a, loc, scale) with a an integer is the Erlang distribution -# gamma(1, loc, scale) is the Exponential distribution -# gamma(df/2, 0, 2) is the chi2 distribution with df degrees of freedom. +## gamma(a, loc, scale) with a an integer is the Erlang distribution +## gamma(1, loc, scale) is the Exponential distribution +## gamma(df/2, 0, 2) is the chi2 distribution with df degrees of freedom. class gamma_gen(rv_continuous): - """A gamma continuous random variable. 
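Analogously to `genpareto`, the ``c == 0`` branch of the `genextreme` code above reproduces the Gumbel law; a quick check (editorial aside)::

    >>> import numpy as np
    >>> from scipy.stats import genextreme, gumbel_r
    >>> x = np.linspace(-2, 5, 8)
    >>> np.allclose(genextreme.cdf(x, c=0), gumbel_r.cdf(x))
    True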
%(before_notes)s @@ -1999,36 +2172,28 @@ class gamma_gen(rv_continuous): ----- The probability density function for `gamma` is:: - gamma.pdf(x, a) = lambda**a * x**(a-1) * exp(-lambda*x) / gamma(a) + gamma.pdf(x, a) = x**(a-1) * exp(-x) / gamma(a) for ``x >= 0``, ``a > 0``. Here ``gamma(a)`` refers to the gamma function. - The scale parameter is equal to ``scale = 1.0 / lambda``. - - `gamma` has a shape parameter `a` which needs to be set explicitly. For - instance: - - >>> from scipy.stats import gamma - >>> rv = gamma(3., loc = 0., scale = 2.) - - produces a frozen form of `gamma` with shape ``a = 3.``, ``loc =0.`` - and ``lambda = 1./scale = 1./2.``. + `gamma` has a shape parameter `a` which needs to be set explicitly. When ``a`` is an integer, `gamma` reduces to the Erlang distribution, and when ``a=1`` to the exponential distribution. + %(after_notes)s + %(example)s """ - def _rvs(self, a): - return mtrand.standard_gamma(a, self._size) + return self._random_state.standard_gamma(a, self._size) def _pdf(self, x, a): return exp(self._logpdf(x, a)) def _logpdf(self, x, a): - return special.xlogy(a - 1.0, x) - x - gamln(a) + return special.xlogy(a-1.0, x) - x - gamln(a) def _cdf(self, x, a): return special.gammainc(a, x) @@ -2040,10 +2205,10 @@ class gamma_gen(rv_continuous): return special.gammaincinv(a, q) def _stats(self, a): - return a, a, 2.0 / sqrt(a), 6.0 / a + return a, a, 2.0/sqrt(a), 6.0/a def _entropy(self, a): - return special.psi(a) * (1 - a) + a + gamln(a) + return special.psi(a)*(1-a) + a + gamln(a) def _fitstart(self, data): # The skewness of the gamma distribution is `4 / sqrt(a)`. @@ -2051,12 +2216,13 @@ class gamma_gen(rv_continuous): # of the data. The formula is regularized with 1e-8 in the # denominator to allow for degenerate data where the skewness # is close to 0. - a = 4 / (1e-8 + _skew(data) ** 2) + a = 4 / (1e-8 + _skew(data)**2) return super(gamma_gen, self)._fitstart(data, args=(a,)) @inherit_docstring_from(rv_continuous) def fit(self, data, *args, **kwds): - f0 = kwds.get('f0', None) + f0 = (kwds.get('f0', None) or kwds.get('fa', None) or + kwds.get('fix_a', None)) floc = kwds.get('floc', None) fscale = kwds.get('fscale', None) @@ -2099,9 +2265,9 @@ class gamma_gen(rv_continuous): # log(a) - special.digamma(a) - log(xbar) + log(data.mean) = 0 s = log(xbar) - log(data).mean() func = lambda a: log(a) - special.digamma(a) - s - aest = (3 - s + np.sqrt((s - 3) ** 2 + 24 * s)) / (12 * s) - xa = aest * (1 - 0.4) - xb = aest * (1 + 0.4) + aest = (3-s + np.sqrt((s-3)**2 + 24*s)) / (12*s) + xa = aest*(1-0.4) + xb = aest*(1+0.4) a = optimize.brentq(func, xa, xb, disp=0) # The MLE for the scale parameter is just the data mean @@ -2121,7 +2287,6 @@ gamma = gamma_gen(a=0.0, name='gamma') class erlang_gen(gamma_gen): - """An Erlang continuous random variable. %(before_notes)s @@ -2157,7 +2322,7 @@ class erlang_gen(gamma_gen): # Override gamma_gen_fitstart so that an integer initial value is # used. (Also regularize the division, to avoid issues when # _skew(data) is 0 or close to 0.) 
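With ``floc`` fixed, the ``fit`` override above reduces the gamma MLE to the one-dimensional root-finding problem ``log(a) - digamma(a) = s`` solved by ``brentq``. A sketch of that fast path (the seed and the 0.5 tolerance below are arbitrary editorial choices)::

    >>> from scipy.stats import gamma
    >>> data = gamma.rvs(3.0, scale=2.0, size=1000, random_state=1234)
    >>> a_hat, loc_hat, scale_hat = gamma.fit(data, floc=0)
    >>> abs(a_hat - 3.0) < 0.5 and loc_hat == 0
    True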
- a = int(4.0 / (1e-8 + _skew(data) ** 2)) + a = int(4.0 / (1e-8 + _skew(data)**2)) return super(gamma_gen, self)._fitstart(data, args=(a,)) # Trivial override of the fit method, so we can monkey-patch its @@ -2167,7 +2332,7 @@ class erlang_gen(gamma_gen): if fit.__doc__ is not None: fit.__doc__ = (rv_continuous.fit.__doc__ + - """ + """ Notes ----- The Erlang distribution is generally defined to have integer values @@ -2181,7 +2346,6 @@ erlang = erlang_gen(a=0.0, name='erlang') class gengamma_gen(rv_continuous): - """A generalized gamma continuous random variable. %(before_notes)s @@ -2194,42 +2358,55 @@ class gengamma_gen(rv_continuous): for ``x > 0``, ``a > 0``, and ``c != 0``. + `gengamma` takes ``a`` and ``c`` as shape parameters. + + %(after_notes)s + %(example)s """ - def _argcheck(self, a, c): return (a > 0) & (c != 0) def _pdf(self, x, a, c): - return exp(self._logpdf(x, a, c)) + return np.exp(self._logpdf(x, a, c)) def _logpdf(self, x, a, c): - return log(abs(c)) + special.xlogy(c * a - 1, x) - x ** c - gamln(a) + return np.log(abs(c)) + special.xlogy(c*a - 1, x) - x**c - special.gammaln(a) def _cdf(self, x, a, c): - val = special.gammainc(a, x ** c) - cond = c + 0 * val - return where(cond > 0, val, 1 - val) + xc = x**c + val1 = special.gammainc(a, xc) + val2 = special.gammaincc(a, xc) + return np.where(c > 0, val1, val2) + + def _sf(self, x, a, c): + xc = x**c + val1 = special.gammainc(a, xc) + val2 = special.gammaincc(a, xc) + return np.where(c > 0, val2, val1) def _ppf(self, q, a, c): val1 = special.gammaincinv(a, q) - val2 = special.gammaincinv(a, 1.0 - q) - ic = 1.0 / c - cond = c + 0 * val1 - return where(cond > 0, val1 ** ic, val2 ** ic) + val2 = special.gammainccinv(a, q) + return np.where(c > 0, val1, val2)**(1.0/c) + + def _isf(self, q, a, c): + val1 = special.gammaincinv(a, q) + val2 = special.gammainccinv(a, q) + return np.where(c > 0, val2, val1)**(1.0/c) def _munp(self, n, a, c): - return special.gamma(a + n * 1.0 / c) / special.gamma(a) + # Pochhammer symbol: poch(a,n) = gamma(a+n)/gamma(a) + return special.poch(a, n*1.0/c) def _entropy(self, a, c): val = special.psi(a) - return a * (1 - val) + 1.0 / c * val + gamln(a) - log(abs(c)) + return a*(1-val) + 1.0/c*val + special.gammaln(a) - np.log(abs(c)) gengamma = gengamma_gen(a=0.0, name='gengamma') class genhalflogistic_gen(rv_continuous): - """A generalized half-logistic continuous random variable. %(before_notes)s @@ -2242,37 +2419,39 @@ class genhalflogistic_gen(rv_continuous): for ``0 <= x <= 1/c``, and ``c > 0``. + `genhalflogistic` takes ``c`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _argcheck(self, c): self.b = 1.0 / c return (c > 0) def _pdf(self, x, c): - limit = 1.0 / c - tmp = asarray(1 - c * x) - tmp0 = tmp ** (limit - 1) - tmp2 = tmp0 * tmp - return 2 * tmp0 / (1 + tmp2) ** 2 + limit = 1.0/c + tmp = asarray(1-c*x) + tmp0 = tmp**(limit-1) + tmp2 = tmp0*tmp + return 2*tmp0 / (1+tmp2)**2 def _cdf(self, x, c): - limit = 1.0 / c - tmp = asarray(1 - c * x) - tmp2 = tmp ** (limit) - return (1.0 - tmp2) / (1 + tmp2) + limit = 1.0/c + tmp = asarray(1-c*x) + tmp2 = tmp**(limit) + return (1.0-tmp2) / (1+tmp2) def _ppf(self, q, c): - return 1.0 / c * (1 - ((1.0 - q) / (1.0 + q)) ** c) + return 1.0/c*(1-((1.0-q)/(1.0+q))**c) def _entropy(self, c): - return 2 - (2 * c + 1) * log(2) + return 2 - (2*c+1)*log(2) genhalflogistic = genhalflogistic_gen(a=0.0, name='genhalflogistic') class gompertz_gen(rv_continuous): - """A Gompertz (or truncated Gumbel) continuous random variable. 
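The new `gengamma` ``_cdf``/``_sf`` pair above dispatches between ``gammainc`` and ``gammaincc`` on the sign of ``c``, so the two tails stay complementary even for negative exponents; a spot check (editorial sketch)::

    >>> from scipy.stats import gengamma
    >>> x, a, c = 2.0, 3.0, -1.5
    >>> round(gengamma.cdf(x, a, c) + gengamma.sf(x, a, c), 12)
    1.0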
%(before_notes)s @@ -2285,29 +2464,31 @@ class gompertz_gen(rv_continuous): for ``x >= 0``, ``c > 0``. + `gompertz` takes ``c`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _pdf(self, x, c): return exp(self._logpdf(x, c)) def _logpdf(self, x, c): - return log(c) + x - c * expm1(x) + return log(c) + x - c * special.expm1(x) def _cdf(self, x, c): - return -expm1(-c * expm1(x)) + return -special.expm1(-c * special.expm1(x)) def _ppf(self, q, c): - return log1p(-1.0 / c * log1p(-q)) + return special.log1p(-1.0 / c * special.log1p(-q)) def _entropy(self, c): - return 1.0 - log(c) - exp(c) * special.expn(1, c) + return 1.0 - log(c) - exp(c)*special.expn(1, c) gompertz = gompertz_gen(a=0.0, name='gompertz') class gumbel_r_gen(rv_continuous): - """A right-skewed Gumbel continuous random variable. %(before_notes)s @@ -2326,10 +2507,11 @@ class gumbel_r_gen(rv_continuous): distribution. It is also related to the extreme value distribution, log-Weibull and Gompertz distributions. + %(after_notes)s + %(example)s """ - def _pdf(self, x): return exp(self._logpdf(x)) @@ -2346,7 +2528,7 @@ class gumbel_r_gen(rv_continuous): return -log(-log(q)) def _stats(self): - return _EULER, pi * pi / 6.0, 12 * sqrt(6) / pi ** 3 * _ZETA3, 12.0 / 5 + return _EULER, pi*pi/6.0, 12*sqrt(6)/pi**3 * _ZETA3, 12.0/5 def _entropy(self): # http://en.wikipedia.org/wiki/Gumbel_distribution @@ -2355,7 +2537,6 @@ gumbel_r = gumbel_r_gen(name='gumbel_r') class gumbel_l_gen(rv_continuous): - """A left-skewed Gumbel continuous random variable. %(before_notes)s @@ -2374,10 +2555,11 @@ class gumbel_l_gen(rv_continuous): distribution. It is also related to the extreme value distribution, log-Weibull and Gompertz distributions. + %(after_notes)s + %(example)s """ - def _pdf(self, x): return exp(self._logpdf(x)) @@ -2391,8 +2573,8 @@ class gumbel_l_gen(rv_continuous): return log(-log1p(-q)) def _stats(self): - return -_EULER, pi * pi / 6.0, \ - -12 * sqrt(6) / pi ** 3 * _ZETA3, 12.0 / 5 + return -_EULER, pi*pi/6.0, \ + -12*sqrt(6)/pi**3 * _ZETA3, 12.0/5 def _entropy(self): return _EULER + 1. @@ -2400,7 +2582,6 @@ gumbel_l = gumbel_l_gen(name='gumbel_l') class halfcauchy_gen(rv_continuous): - """A Half-Cauchy continuous random variable. %(before_notes)s @@ -2413,32 +2594,32 @@ class halfcauchy_gen(rv_continuous): for ``x >= 0``. + %(after_notes)s + %(example)s """ - def _pdf(self, x): - return 2.0 / pi / (1.0 + x * x) + return 2.0/pi/(1.0+x*x) def _logpdf(self, x): - return np.log(2.0 / pi) - special.log1p(x * x) + return np.log(2.0/pi) - special.log1p(x*x) def _cdf(self, x): - return 2.0 / pi * arctan(x) + return 2.0/pi*arctan(x) def _ppf(self, q): - return tan(pi / 2 * q) + return tan(pi/2*q) def _stats(self): return inf, inf, nan, nan def _entropy(self): - return log(2 * pi) + return log(2*pi) halfcauchy = halfcauchy_gen(a=0.0, name='halfcauchy') class halflogistic_gen(rv_continuous): - """A half-logistic continuous random variable. %(before_notes)s @@ -2451,10 +2632,11 @@ class halflogistic_gen(rv_continuous): for ``x >= 0``. + %(after_notes)s + %(example)s """ - def _pdf(self, x): return exp(self._logpdf(x)) @@ -2462,30 +2644,28 @@ class halflogistic_gen(rv_continuous): return log(2) - x - 2. 
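The `gumbel_r` ``_stats`` above hard-codes the classical moments: mean equal to the Euler-Mascheroni constant and variance ``pi**2/6``. A quick confirmation (editorial aside)::

    >>> import numpy as np
    >>> from scipy.stats import gumbel_r
    >>> mean, var = gumbel_r.stats(moments='mv')
    >>> round(float(mean), 6), np.allclose(var, np.pi**2 / 6)
    (0.577216, True)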
* special.log1p(exp(-x)) def _cdf(self, x): - return tanh(x / 2.0) + return tanh(x/2.0) def _ppf(self, q): - return 2 * arctanh(q) + return 2*arctanh(q) def _munp(self, n): if n == 1: - return 2 * log(2) + return 2*log(2) if n == 2: - return pi * pi / 3.0 + return pi*pi/3.0 if n == 3: - return 9 * _ZETA3 + return 9*_ZETA3 if n == 4: - return 7 * pi ** 4 / 15.0 - return 2 * (1 - pow(2.0, 1 - n)) * \ - special.gamma(n + 1) * special.zeta(n, 1) + return 7*pi**4 / 15.0 + return 2*(1-pow(2.0, 1-n))*special.gamma(n+1)*special.zeta(n, 1) def _entropy(self): - return 2 - log(2) + return 2-log(2) halflogistic = halflogistic_gen(a=0.0, name='halflogistic') class halfnorm_gen(rv_continuous): - """A half-normal continuous random variable. %(before_notes)s @@ -2500,37 +2680,36 @@ class halfnorm_gen(rv_continuous): `halfnorm` is a special case of `chi` with ``df == 1``. + %(after_notes)s + %(example)s """ - def _rvs(self): - return abs(mtrand.standard_normal(size=self._size)) + return abs(self._random_state.standard_normal(size=self._size)) def _pdf(self, x): - return sqrt(2.0 / pi) * exp(-x * x / 2.0) + return sqrt(2.0/pi)*exp(-x*x/2.0) def _logpdf(self, x): - return 0.5 * np.log(2.0 / pi) - x * x / 2.0 + return 0.5 * np.log(2.0/pi) - x*x/2.0 def _cdf(self, x): - return special.ndtr(x) * 2 - 1.0 + return special.ndtr(x)*2-1.0 def _ppf(self, q): - return special.ndtri((1 + q) / 2.0) + return special.ndtri((1+q)/2.0) def _stats(self): - return (sqrt(2.0 / pi), - 1 - 2.0 / pi, sqrt(2) * (4 - pi) / (pi - 2) ** 1.5, - 8 * (pi - 3) / (pi - 2) ** 2) + return (sqrt(2.0/pi), 1-2.0/pi, sqrt(2)*(4-pi)/(pi-2)**1.5, + 8*(pi-3)/(pi-2)**2) def _entropy(self): - return 0.5 * log(pi / 2.0) + 0.5 + return 0.5*log(pi/2.0)+0.5 halfnorm = halfnorm_gen(a=0.0, name='halfnorm') class hypsecant_gen(rv_continuous): - """A hyperbolic secant continuous random variable. %(before_notes)s @@ -2541,29 +2720,29 @@ class hypsecant_gen(rv_continuous): hypsecant.pdf(x) = 1/pi * sech(x) + %(after_notes)s + %(example)s """ - def _pdf(self, x): - return 1.0 / (pi * cosh(x)) + return 1.0/(pi*cosh(x)) def _cdf(self, x): - return 2.0 / pi * arctan(exp(x)) + return 2.0/pi*arctan(exp(x)) def _ppf(self, q): - return log(tan(pi * q / 2.0)) + return log(tan(pi*q/2.0)) def _stats(self): - return 0, pi * pi / 4, 0, 2 + return 0, pi*pi/4, 0, 2 def _entropy(self): - return log(2 * pi) + return log(2*pi) hypsecant = hypsecant_gen(name='hypsecant') class gausshyper_gen(rv_continuous): - """A Gauss hypergeometric continuous random variable. %(before_notes)s @@ -2578,28 +2757,29 @@ class gausshyper_gen(rv_continuous): for ``0 <= x <= 1``, ``a > 0``, ``b > 0``, and ``C = 1 / (B(a, b) F[2, 1](c, a; a+b; -z))`` + `gausshyper` takes ``a``, ``b``, ``c`` and ``z`` as shape parameters. 
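As the docstring above says, `halfnorm` is `chi` with one degree of freedom; checking the identity numerically (editorial aside)::

    >>> import numpy as np
    >>> from scipy.stats import halfnorm, chi
    >>> x = np.linspace(0.1, 3, 5)
    >>> np.allclose(halfnorm.pdf(x), chi.pdf(x, df=1))
    True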
+ + %(after_notes)s + %(example)s """ - def _argcheck(self, a, b, c, z): return (a > 0) & (b > 0) & (c == c) & (z == z) def _pdf(self, x, a, b, c, z): - Cinv = gam(a) * gam(b) / gam(a + b) * special.hyp2f1(c, a, a + b, -z) - return 1.0 / Cinv * \ - x ** (a - 1.0) * (1.0 - x) ** (b - 1.0) / (1.0 + z * x) ** c + Cinv = gam(a)*gam(b)/gam(a+b)*special.hyp2f1(c, a, a+b, -z) + return 1.0/Cinv * x**(a-1.0) * (1.0-x)**(b-1.0) / (1.0+z*x)**c def _munp(self, n, a, b, c, z): - fac = special.beta(n + a, b) / special.beta(a, b) - num = special.hyp2f1(c, a + n, a + b + n, -z) - den = special.hyp2f1(c, a, a + b, -z) - return fac * num / den + fac = special.beta(n+a, b) / special.beta(a, b) + num = special.hyp2f1(c, a+n, a+b+n, -z) + den = special.hyp2f1(c, a, a+b, -z) + return fac*num / den gausshyper = gausshyper_gen(a=0.0, b=1.0, name='gausshyper') class invgamma_gen(rv_continuous): - """An inverted gamma continuous random variable. %(before_notes)s @@ -2612,27 +2792,30 @@ class invgamma_gen(rv_continuous): for x > 0, a > 0. + `invgamma` takes ``a`` as a shape parameter. + `invgamma` is a special case of `gengamma` with ``c == -1``. + %(after_notes)s + %(example)s """ - def _pdf(self, x, a): return exp(self._logpdf(x, a)) def _logpdf(self, x, a): - return (-(a + 1) * log(x) - gamln(a) - 1.0 / x) + return (-(a+1) * log(x) - gamln(a) - 1.0/x) def _cdf(self, x, a): - return 1.0 - special.gammainc(a, 1.0 / x) + return 1.0 - special.gammainc(a, 1.0/x) def _ppf(self, q, a): - return 1.0 / special.gammaincinv(a, 1. - q) + return 1.0 / special.gammaincinv(a, 1.-q) def _stats(self, a, moments='mvsk'): m1 = _lazywhere(a > 1, (a,), lambda x: 1. / (x - 1.), np.inf) - m2 = _lazywhere(a > 2, (a,), lambda x: 1. / (x - 1.) ** 2 / (x - 2.), + m2 = _lazywhere(a > 2, (a,), lambda x: 1. / (x - 1.)**2 / (x - 2.), np.inf) g1, g2 = None, None @@ -2647,13 +2830,12 @@ class invgamma_gen(rv_continuous): return m1, m2, g1, g2 def _entropy(self, a): - return a - (a + 1.0) * special.psi(a) + gamln(a) + return a - (a+1.0) * special.psi(a) + gamln(a) invgamma = invgamma_gen(a=0.0, name='invgamma') # scale is gamma from DATAPLOT and B from Regress class invgauss_gen(rv_continuous): - """An inverse Gaussian continuous random variable. %(before_notes)s @@ -2666,6 +2848,10 @@ class invgauss_gen(rv_continuous): for ``x > 0``. + `invgauss` takes ``mu`` as a shape parameter. + + %(after_notes)s + When `mu` is too small, evaluating the cumulative density function will be inaccurate due to ``cdf(mu -> 0) = inf * 0``. NaNs are returned for ``mu <= 0.0028``. @@ -2673,32 +2859,28 @@ class invgauss_gen(rv_continuous): %(example)s """ - def _rvs(self, mu): - return mtrand.wald(mu, 1.0, size=self._size) + return self._random_state.wald(mu, 1.0, size=self._size) def _pdf(self, x, mu): - return 1.0 / sqrt(2 * pi * x ** 3.0) * \ - exp(-1.0 / (2 * x) * ((x - mu) / mu) ** 2) + return 1.0/sqrt(2*pi*x**3.0)*exp(-1.0/(2*x)*((x-mu)/mu)**2) def _logpdf(self, x, mu): - return -0.5 * log(2 * pi) - 1.5 * log(x) - \ - ((x - mu) / mu) ** 2 / (2 * x) + return -0.5*log(2*pi) - 1.5*log(x) - ((x-mu)/mu)**2/(2*x) def _cdf(self, x, mu): - fac = sqrt(1.0 / x) + fac = sqrt(1.0/x) # Numerical accuracy for small `mu` is bad. See #869. 
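The docstring addition above records that `invgamma` is `gengamma` with ``c == -1``; a one-line verification (editorial aside)::

    >>> import numpy as np
    >>> from scipy.stats import invgamma, gengamma
    >>> x = np.linspace(0.5, 5, 5)
    >>> np.allclose(invgamma.pdf(x, 4.0), gengamma.pdf(x, 4.0, -1.0))
    True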
- C1 = _norm_cdf(fac * (x - mu) / mu) - C1 += exp(1.0 / mu) * _norm_cdf(-fac * (x + mu) / mu) * exp(1.0 / mu) + C1 = _norm_cdf(fac*(x-mu)/mu) + C1 += exp(1.0/mu) * _norm_cdf(-fac*(x+mu)/mu) * exp(1.0/mu) return C1 def _stats(self, mu): - return mu, mu ** 3.0, 3 * sqrt(mu), 15 * mu + return mu, mu**3.0, 3*sqrt(mu), 15*mu invgauss = invgauss_gen(a=0.0, name='invgauss') class invweibull_gen(rv_continuous): - """An inverted Weibull continuous random variable. %(before_notes)s @@ -2711,6 +2893,10 @@ class invweibull_gen(rv_continuous): for ``x > 0``, ``c > 0``. + `invweibull` takes ``c`` as a shape parameter. + + %(after_notes)s + References ---------- F.R.S. de Gusmao, E.M.M Ortega and G.M. Cordeiro, "The generalized inverse @@ -2719,7 +2905,6 @@ class invweibull_gen(rv_continuous): %(example)s """ - def _pdf(self, x, c): xc1 = np.power(x, -c - 1.0) xc2 = np.power(x, -c) @@ -2731,18 +2916,17 @@ class invweibull_gen(rv_continuous): return exp(-xc1) def _ppf(self, q, c): - return np.power(-log(q), -1.0 / c) + return np.power(-log(q), -1.0/c) def _munp(self, n, c): return special.gamma(1 - n / c) def _entropy(self, c): - return 1 + _EULER + _EULER / c - log(c) + return 1+_EULER + _EULER / c - log(c) invweibull = invweibull_gen(a=0, name='invweibull') class johnsonsb_gen(rv_continuous): - """A Johnson SB continuous random variable. %(before_notes)s @@ -2759,19 +2943,22 @@ class johnsonsb_gen(rv_continuous): for ``0 < x < 1`` and ``a, b > 0``, and ``phi`` is the normal pdf. + `johnsonsb` takes ``a`` and ``b`` as shape parameters. + + %(after_notes)s + %(example)s """ - def _argcheck(self, a, b): return (b > 0) & (a == a) def _pdf(self, x, a, b): - trm = _norm_pdf(a + b * log(x / (1.0 - x))) - return b * 1.0 / (x * (1 - x)) * trm + trm = _norm_pdf(a + b*log(x/(1.0-x))) + return b*1.0/(x*(1-x))*trm def _cdf(self, x, a, b): - return _norm_cdf(a + b * log(x / (1.0 - x))) + return _norm_cdf(a + b*log(x/(1.0-x))) def _ppf(self, q, a, b): return 1.0 / (1 + exp(-1.0 / b * (_norm_ppf(q) - a))) @@ -2779,7 +2966,6 @@ johnsonsb = johnsonsb_gen(a=0.0, b=1.0, name='johnsonsb') class johnsonsu_gen(rv_continuous): - """A Johnson SU continuous random variable. %(before_notes)s @@ -2797,20 +2983,23 @@ class johnsonsu_gen(rv_continuous): for all ``x, a, b > 0``, and `phi` is the normal pdf. + `johnsonsu` takes ``a`` and ``b`` as shape parameters. + + %(after_notes)s + %(example)s """ - def _argcheck(self, a, b): return (b > 0) & (a == a) def _pdf(self, x, a, b): - x2 = x * x - trm = _norm_pdf(a + b * log(x + sqrt(x2 + 1))) - return b * 1.0 / sqrt(x2 + 1.0) * trm + x2 = x*x + trm = _norm_pdf(a + b * log(x + sqrt(x2+1))) + return b*1.0/sqrt(x2+1.0)*trm def _cdf(self, x, a, b): - return _norm_cdf(a + b * log(x + sqrt(x * x + 1))) + return _norm_cdf(a + b * log(x + sqrt(x*x + 1))) def _ppf(self, q, a, b): return sinh((_norm_ppf(q) - a) / b) @@ -2818,7 +3007,6 @@ johnsonsu = johnsonsu_gen(name='johnsonsu') class laplace_gen(rv_continuous): - """A Laplace continuous random variable. 
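Per the ``_stats`` method above, `invgauss` has mean ``mu`` and variance ``mu**3`` in the standard (``scale = 1``) parametrization; a spot check::

    >>> from scipy.stats import invgauss
    >>> mean, var = invgauss.stats(0.5, moments='mv')
    >>> float(mean), float(var)
    (0.5, 0.125)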
%(before_notes)s @@ -2829,18 +3017,19 @@ class laplace_gen(rv_continuous): laplace.pdf(x) = 1/2 * exp(-abs(x)) + %(after_notes)s + %(example)s """ - def _rvs(self): - return mtrand.laplace(0, 1, size=self._size) + return self._random_state.laplace(0, 1, size=self._size) def _pdf(self, x): - return 0.5 * exp(-abs(x)) + return 0.5*exp(-abs(x)) def _cdf(self, x): - return where(x > 0, 1.0 - 0.5 * exp(-x), 0.5 * exp(x)) + return where(x > 0, 1.0-0.5*exp(-x), 0.5*exp(x)) def _ppf(self, q): return where(q > 0.5, -log(2) - log1p(-q), log(2 * q)) @@ -2849,12 +3038,11 @@ class laplace_gen(rv_continuous): return 0, 2, 0, 3 def _entropy(self): - return log(2) + 1 + return log(2)+1 laplace = laplace_gen(name='laplace') class levy_gen(rv_continuous): - """A Levy continuous random variable. %(before_notes)s @@ -2873,12 +3061,13 @@ class levy_gen(rv_continuous): This is the same as the Levy-stable distribution with a=1/2 and b=1. + %(after_notes)s + %(example)s """ - def _pdf(self, x): - return 1 / sqrt(2 * pi * x) / x * exp(-1 / (2 * x)) + return 1 / sqrt(2*pi*x) / x * exp(-1/(2*x)) def _cdf(self, x): # Equivalent to 2*norm.sf(sqrt(1/x)) @@ -2886,7 +3075,7 @@ class levy_gen(rv_continuous): def _ppf(self, q): # Equivalent to 1.0/(norm.isf(q/2)**2) or 0.5/(erfcinv(q)**2) - val = -special.ndtri(q / 2) + val = -special.ndtri(q/2) return 1.0 / (val * val) def _stats(self): @@ -2895,7 +3084,6 @@ levy = levy_gen(a=0.0, name="levy") class levy_l_gen(rv_continuous): - """A left-skewed Levy continuous random variable. %(before_notes)s @@ -2914,13 +3102,14 @@ class levy_l_gen(rv_continuous): This is the same as the Levy-stable distribution with a=1/2 and b=-1. + %(after_notes)s + %(example)s """ - def _pdf(self, x): ax = abs(x) - return 1 / sqrt(2 * pi * ax) / ax * exp(-1 / (2 * ax)) + return 1/sqrt(2*pi*ax)/ax*exp(-1/(2*ax)) def _cdf(self, x): ax = abs(x) @@ -2936,7 +3125,6 @@ levy_l = levy_l_gen(b=0.0, name="levy_l") class levy_stable_gen(rv_continuous): - """A Levy-stable continuous random variable. %(before_notes)s @@ -2950,29 +3138,27 @@ class levy_stable_gen(rv_continuous): Levy-stable distribution (only random variates available -- ignore other docs) + %(after_notes)s + %(example)s """ - def _rvs(self, alpha, beta): sz = self._size - TH = uniform.rvs(loc=-pi / 2.0, scale=pi, size=sz) + TH = uniform.rvs(loc=-pi/2.0, scale=pi, size=sz) W = expon.rvs(size=sz) if alpha == 1: - return 2 / pi * (pi / 2 + beta * TH) * tan(TH) - beta * \ - log((pi / 2 * W * cos(TH)) / (pi / 2 + beta * TH)) + return 2/pi*(pi/2+beta*TH)*tan(TH)-beta*log((pi/2*W*cos(TH))/(pi/2+beta*TH)) - ialpha = 1.0 / alpha - aTH = alpha * TH + ialpha = 1.0/alpha + aTH = alpha*TH if beta == 0: - return W / (cos(TH) / tan(aTH) + sin(TH)) * \ - ((cos(aTH) + sin(aTH) * tan(TH)) / W) ** ialpha - - val0 = beta * tan(pi * alpha / 2) - th0 = arctan(val0) / alpha - val3 = W / (cos(TH) / tan(alpha * (th0 + TH)) + sin(TH)) - res3 = val3 * ((cos(aTH) + sin(aTH) * tan(TH) - val0 * - (sin(aTH) - cos(aTH) * tan(TH))) / W) ** ialpha + return W/(cos(TH)/tan(aTH)+sin(TH))*((cos(aTH)+sin(aTH)*tan(TH))/W)**ialpha + + val0 = beta*tan(pi*alpha/2) + th0 = arctan(val0)/alpha + val3 = W/(cos(TH)/tan(alpha*(th0+TH))+sin(TH)) + res3 = val3*((cos(aTH)+sin(aTH)*tan(TH)-val0*(sin(aTH)-cos(aTH)*tan(TH)))/W)**ialpha return res3 def _argcheck(self, alpha, beta): @@ -2988,7 +3174,6 @@ levy_stable = levy_stable_gen(name='levy_stable') class logistic_gen(rv_continuous): - """A logistic (or Sech-squared) continuous random variable. 
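A round-trip through the closed-form ``_ppf``/``_cdf`` pair of `laplace` shown above (editorial sketch; the rounding only absorbs float noise)::

    >>> from scipy.stats import laplace
    >>> round(laplace.cdf(laplace.ppf(0.975)), 12)
    0.975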
%(before_notes)s @@ -3001,12 +3186,13 @@ class logistic_gen(rv_continuous): `logistic` is a special case of `genlogistic` with ``c == 1``. + %(after_notes)s + %(example)s """ - def _rvs(self): - return mtrand.logistic(size=self._size) + return self._random_state.logistic(size=self._size) def _pdf(self, x): return exp(self._logpdf(x)) @@ -3021,7 +3207,7 @@ class logistic_gen(rv_continuous): return -log1p(-q) + log(q) def _stats(self): - return 0, pi * pi / 3.0, 0, 6.0 / 5.0 + return 0, pi*pi/3.0, 0, 6.0/5.0 def _entropy(self): # http://en.wikipedia.org/wiki/Logistic_distribution @@ -3030,7 +3216,6 @@ logistic = logistic_gen(name='logistic') class loggamma_gen(rv_continuous): - """A log gamma continuous random variable. %(before_notes)s @@ -3043,15 +3228,18 @@ class loggamma_gen(rv_continuous): for all ``x, c > 0``. + `loggamma` takes ``c`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _rvs(self, c): - return log(mtrand.gamma(c, size=self._size)) + return log(self._random_state.gamma(c, size=self._size)) def _pdf(self, x, c): - return exp(c * x - exp(x) - gamln(c)) + return exp(c*x-exp(x)-gamln(c)) def _cdf(self, x, c): return special.gammainc(c, exp(x)) @@ -3065,14 +3253,13 @@ class loggamma_gen(rv_continuous): mean = special.digamma(c) var = special.polygamma(1, c) skewness = special.polygamma(2, c) / np.power(var, 1.5) - excess_kurtosis = special.polygamma(3, c) / (var * var) + excess_kurtosis = special.polygamma(3, c) / (var*var) return mean, var, skewness, excess_kurtosis loggamma = loggamma_gen(name='loggamma') class loglaplace_gen(rv_continuous): - """A log-Laplace continuous random variable. %(before_notes)s @@ -3086,6 +3273,10 @@ class loglaplace_gen(rv_continuous): for ``c > 0``. + `loglaplace` takes ``c`` as a shape parameter. + + %(after_notes)s + References ---------- T.J. Kozubowski and K. Podgorski, "A log-Laplace growth rate model", @@ -3094,34 +3285,32 @@ class loglaplace_gen(rv_continuous): %(example)s """ - def _pdf(self, x, c): - cd2 = c / 2.0 + cd2 = c/2.0 c = where(x < 1, c, -c) - return cd2 * x ** (c - 1) + return cd2*x**(c-1) def _cdf(self, x, c): - return where(x < 1, 0.5 * x ** c, 1 - 0.5 * x ** (-c)) + return where(x < 1, 0.5*x**c, 1-0.5*x**(-c)) def _ppf(self, q, c): - return where( - q < 0.5, (2.0 * q) ** (1.0 / c), (2 * (1.0 - q)) ** (-1.0 / c)) + return where(q < 0.5, (2.0*q)**(1.0/c), (2*(1.0-q))**(-1.0/c)) def _munp(self, n, c): - return c ** 2 / (c ** 2 - n ** 2) + return c**2 / (c**2 - n**2) def _entropy(self, c): - return log(2.0 / c) + 1.0 + return log(2.0/c) + 1.0 loglaplace = loglaplace_gen(a=0.0, name='loglaplace') def _lognorm_logpdf(x, s): - return -log(x) ** 2 / (2 * s ** 2) + \ - np.where(x == 0, 0, -log(s * x * sqrt(2 * pi))) + return _lazywhere(x != 0, (x, s), + lambda x, s: -log(x)**2 / (2*s**2) - log(s*x*sqrt(2*pi)), + -np.inf) class lognorm_gen(rv_continuous): - """A lognormal continuous random variable. %(before_notes)s @@ -3134,16 +3323,21 @@ class lognorm_gen(rv_continuous): for ``x > 0``, ``s > 0``. - If ``log(x)`` is normally distributed with mean ``mu`` and variance - ``sigma**2``, then ``x`` is log-normally distributed with shape parameter - sigma and scale parameter ``exp(mu)``. + `lognorm` takes ``s`` as a shape parameter. + + %(after_notes)s + + A common parametrization for a lognormal random variable ``Y`` is in + terms of the mean, ``mu``, and standard deviation, ``sigma``, of the + unique normally distributed random variable ``X`` such that exp(X) = Y. 
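The `loggamma` ``_stats`` above expresses the moments through polygamma functions: mean ``digamma(c)`` and variance ``polygamma(1, c)``. A quick numerical check (editorial aside)::

    >>> import numpy as np
    >>> from scipy.stats import loggamma
    >>> from scipy.special import digamma, polygamma
    >>> m, v = loggamma.stats(2.5, moments='mv')
    >>> np.allclose([float(m), float(v)], [digamma(2.5), polygamma(1, 2.5)])
    True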
+ This parametrization corresponds to setting ``s = sigma`` and ``scale = + exp(mu)``. %(example)s """ - def _rvs(self, s): - return exp(s * mtrand.standard_normal(self._size)) + return exp(s * self._random_state.standard_normal(self._size)) def _pdf(self, x, s): return exp(self._logpdf(x, s)) @@ -3158,15 +3352,15 @@ class lognorm_gen(rv_continuous): return exp(s * _norm_ppf(q)) def _stats(self, s): - p = exp(s * s) + p = exp(s*s) mu = sqrt(p) - mu2 = p * (p - 1) - g1 = sqrt((p - 1)) * (2 + p) + mu2 = p*(p-1) + g1 = sqrt((p-1))*(2+p) g2 = np.polyval([1, 2, 3, 0, -6.0], p) return mu, mu2, g1, g2 def _entropy(self, s): - return 0.5 * (1 + log(2 * pi) + 2 * log(s)) + return 0.5 * (1 + log(2*pi) + 2 * log(s)) def _fitstart(self, data): scale = data.std() @@ -3179,7 +3373,6 @@ lognorm = lognorm_gen(a=0.0, name='lognorm') class gilbrat_gen(rv_continuous): - """A Gilbrat continuous random variable. %(before_notes)s @@ -3192,12 +3385,13 @@ class gilbrat_gen(rv_continuous): `gilbrat` is a special case of `lognorm` with ``s = 1``. + %(after_notes)s + %(example)s """ - def _rvs(self): - return exp(mtrand.standard_normal(self._size)) + return exp(self._random_state.standard_normal(self._size)) def _pdf(self, x): return exp(self._logpdf(x)) @@ -3230,7 +3424,6 @@ gilbrat = gilbrat_gen(a=0.0, name='gilbrat') class maxwell_gen(rv_continuous): - """A Maxwell continuous random variable. %(before_notes)s @@ -3247,38 +3440,37 @@ class maxwell_gen(rv_continuous): for ``x > 0``. + %(after_notes)s + References ---------- .. [1] http://mathworld.wolfram.com/MaxwellDistribution.html %(example)s """ - def _rvs(self): - return chi.rvs(3.0, size=self._size) + return chi.rvs(3.0, size=self._size, random_state=self._random_state) def _pdf(self, x): - return sqrt(2.0 / pi) * x * x * exp(-x * x / 2.0) + return sqrt(2.0/pi)*x*x*exp(-x*x/2.0) def _cdf(self, x): - return special.gammainc(1.5, x * x / 2.0) + return special.gammainc(1.5, x*x/2.0) def _ppf(self, q): - return sqrt(2 * special.gammaincinv(1.5, q)) + return sqrt(2*special.gammaincinv(1.5, q)) def _stats(self): - val = 3 * pi - 8 - return (2 * sqrt(2.0 / pi), 3 - - 8 / pi, sqrt(2) * (32 - 10 * pi) / val ** 1.5, - (-12 * pi * pi + 160 * pi - 384) / val ** 2.0) + val = 3*pi-8 + return (2*sqrt(2.0/pi), 3-8/pi, sqrt(2)*(32-10*pi)/val**1.5, + (-12*pi*pi + 160*pi - 384) / val**2.0) def _entropy(self): - return _EULER + 0.5 * log(2 * pi) - 0.5 + return _EULER + 0.5*log(2*pi)-0.5 maxwell = maxwell_gen(a=0.0, name='maxwell') class mielke_gen(rv_continuous): - """A Mielke's Beta-Kappa continuous random variable. %(before_notes)s @@ -3291,24 +3483,26 @@ class mielke_gen(rv_continuous): for ``x > 0``. + `mielke` takes ``k`` and ``s`` as shape parameters. + + %(after_notes)s + %(example)s """ - def _pdf(self, x, k, s): - return k * x ** (k - 1.0) / (1.0 + x ** s) ** (1.0 + k * 1.0 / s) + return k*x**(k-1.0) / (1.0+x**s)**(1.0+k*1.0/s) def _cdf(self, x, k, s): - return x ** k / (1.0 + x ** s) ** (k * 1.0 / s) + return x**k / (1.0+x**s)**(k*1.0/s) def _ppf(self, q, k, s): - qsk = pow(q, s * 1.0 / k) - return pow(qsk / (1.0 - qsk), 1.0 / s) + qsk = pow(q, s*1.0/k) + return pow(qsk/(1.0-qsk), 1.0/s) mielke = mielke_gen(a=0.0, name='mielke') class nakagami_gen(rv_continuous): - """A Nakagami continuous random variable. %(before_notes)s @@ -3322,32 +3516,33 @@ class nakagami_gen(rv_continuous): for ``x > 0``, ``nu > 0``. + `nakagami` takes ``nu`` as a shape parameter. 
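The parametrization note added to the `lognorm` docstring above translates into code as follows (editorial sketch): ``mu`` and ``sigma`` of the underlying normal map to ``scale = exp(mu)`` and ``s = sigma``::

    >>> import numpy as np
    >>> from scipy.stats import lognorm, norm
    >>> mu, sigma, x = 0.3, 1.2, 2.0
    >>> np.allclose(lognorm.cdf(x, s=sigma, scale=np.exp(mu)),
    ...             norm.cdf(np.log(x), loc=mu, scale=sigma))
    True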
+ + %(after_notes)s + %(example)s """ - def _pdf(self, x, nu): - return 2 * nu ** nu / \ - gam(nu) * (x ** (2 * nu - 1.0)) * exp(-nu * x * x) + return 2*nu**nu/gam(nu)*(x**(2*nu-1.0))*exp(-nu*x*x) def _cdf(self, x, nu): - return special.gammainc(nu, nu * x * x) + return special.gammainc(nu, nu*x*x) def _ppf(self, q, nu): - return sqrt(1.0 / nu * special.gammaincinv(nu, q)) + return sqrt(1.0/nu*special.gammaincinv(nu, q)) def _stats(self, nu): - mu = gam(nu + 0.5) / gam(nu) / sqrt(nu) - mu2 = 1.0 - mu * mu - g1 = mu * (1 - 4 * nu * mu2) / 2.0 / nu / np.power(mu2, 1.5) - g2 = -6 * mu ** 4 * nu + (8 * nu - 2) * mu ** 2 - 2 * nu + 1 - g2 /= nu * mu2 ** 2.0 + mu = gam(nu+0.5)/gam(nu)/sqrt(nu) + mu2 = 1.0-mu*mu + g1 = mu * (1 - 4*nu*mu2) / 2.0 / nu / np.power(mu2, 1.5) + g2 = -6*mu**4*nu + (8*nu-2)*mu**2-2*nu + 1 + g2 /= nu*mu2**2.0 return mu, mu2, g1, g2 nakagami = nakagami_gen(a=0.0, name="nakagami") class ncx2_gen(rv_continuous): - """A non-central chi-squared continuous random variable. %(before_notes)s @@ -3356,17 +3551,20 @@ class ncx2_gen(rv_continuous): ----- The probability density function for `ncx2` is:: - ncx2.pdf(x, df, nc) = exp(-(nc+df)/2) * 1/2 * (x/nc)**((df-2)/4) + ncx2.pdf(x, df, nc) = exp(-(nc+x)/2) * 1/2 * (x/nc)**((df-2)/4) * I[(df-2)/2](sqrt(nc*x)) for ``x > 0``. + `ncx2` takes ``df`` and ``nc`` as shape parameters. + + %(after_notes)s + %(example)s """ - def _rvs(self, df, nc): - return mtrand.noncentral_chisquare(df, nc, self._size) + return self._random_state.noncentral_chisquare(df, nc, self._size) def _logpdf(self, x, df, nc): return _ncx2_log_pdf(x, df, nc) @@ -3381,9 +3579,9 @@ class ncx2_gen(rv_continuous): return special.chndtrix(q, df, nc) def _stats(self, df, nc): - val = df + 2.0 * nc - return (df + nc, 2 * val, sqrt(8) * (val + nc) / val ** 1.5, - 12.0 * (val + 2 * nc) / val ** 2.0) + val = df + 2.0*nc + return (df + nc, 2*val, sqrt(8)*(val+nc)/val**1.5, + 12.0*(val+2*nc)/val**2.0) def _fitstart(self, data): m = data.mean() @@ -3396,7 +3594,6 @@ ncx2 = ncx2_gen(a=0.0, name='ncx2') class ncf_gen(rv_continuous): - """A non-central F distribution continuous random variable. %(before_notes)s @@ -3414,33 +3611,25 @@ class ncf_gen(rv_continuous): for ``df1, df2, nc > 0``. + `ncf` takes ``df1``, ``df2`` and ``nc`` as shape parameters. + + %(after_notes)s + %(example)s """ - def _rvs(self, dfn, dfd, nc): - return mtrand.noncentral_f(dfn, dfd, nc, self._size) + return self._random_state.noncentral_f(dfn, dfd, nc, self._size) def _pdf_skip(self, x, dfn, dfd, nc): n1, n2 = dfn, dfd - term = -nc / 2 + nc * n1 * x / \ - (2 * (n2 + n1 * x)) + gamln(n1 / 2.) + gamln(1 + n2 / 2.) - term -= gamln((n1 + n2) / 2.0) + term = -nc/2+nc*n1*x/(2*(n2+n1*x)) + gamln(n1/2.)+gamln(1+n2/2.) + term -= gamln((n1+n2)/2.0) Px = exp(term) - Px *= n1 ** (n1 / 2) * n2 ** (n2 / 2) * x ** (n1 / 2 - 1) - Px *= (n2 + n1 * x) ** (-(n1 + n2) / 2) - Px *= special.assoc_laguerre(- - nc * - n1 * - x / - (2.0 * - (n2 + - n1 * - x)), n2 / - 2, n1 / - 2 - - 1) - Px /= special.beta(n1 / 2, n2 / 2) + Px *= n1**(n1/2) * n2**(n2/2) * x**(n1/2-1) + Px *= (n2+n1*x)**(-(n1+n2)/2) + Px *= special.assoc_laguerre(-nc*n1*x/(2.0*(n2+n1*x)), n2/2, n1/2-1) + Px /= special.beta(n1/2, n2/2) # This function does not have a return. Drop it for now, the generic # function seems to work OK. 
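The corrected `ncx2` docstring above (``exp(-(nc+x)/2)``, not ``exp(-(nc+df)/2)``) goes with the textbook moments coded in ``_stats``: mean ``df + nc`` and variance ``2*(df + 2*nc)``. A spot check (editorial aside)::

    >>> from scipy.stats import ncx2
    >>> mean, var = ncx2.stats(4, 2.5, moments='mv')
    >>> float(mean), float(var)
    (6.5, 18.0)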
@@ -3451,23 +3640,22 @@ class ncf_gen(rv_continuous): return special.ncfdtri(dfn, dfd, nc, q) def _munp(self, n, dfn, dfd, nc): - val = (dfn * 1.0 / dfd) ** n - term = gamln(n + 0.5 * dfn) + gamln(0.5 * dfd - n) - gamln(dfd * 0.5) - val *= exp(-nc / 2.0 + term) - val *= special.hyp1f1(n + 0.5 * dfn, 0.5 * dfn, 0.5 * nc) + val = (dfn * 1.0/dfd)**n + term = gamln(n+0.5*dfn) + gamln(0.5*dfd-n) - gamln(dfd*0.5) + val *= exp(-nc / 2.0+term) + val *= special.hyp1f1(n+0.5*dfn, 0.5*dfn, 0.5*nc) return val def _stats(self, dfn, dfd, nc): - mu = where(dfd <= 2, inf, dfd / (dfd - 2.0) * (1 + nc * 1.0 / dfn)) - mu2 = where(dfd <= 4, inf, 2 * (dfd * 1.0 / dfn) ** 2.0 * - ((dfn + nc / 2.0) ** 2.0 + (dfn + nc) * (dfd - 2.0)) / - ((dfd - 2.0) ** 2.0 * (dfd - 4.0))) + mu = where(dfd <= 2, inf, dfd / (dfd-2.0)*(1+nc*1.0/dfn)) + mu2 = where(dfd <= 4, inf, 2*(dfd*1.0/dfn)**2.0 * + ((dfn+nc/2.0)**2.0 + (dfn+nc)*(dfd-2.0)) / + ((dfd-2.0)**2.0 * (dfd-4.0))) return mu, mu2, None, None ncf = ncf_gen(a=0.0, name='ncf') class t_gen(rv_continuous): - """A Student's T continuous random variable. %(before_notes)s @@ -3482,23 +3670,26 @@ class t_gen(rv_continuous): for ``df > 0``. + `t` takes ``df`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _rvs(self, df): - return mtrand.standard_t(df, size=self._size) + return self._random_state.standard_t(df, size=self._size) def _pdf(self, x, df): - r = asarray(df * 1.0) - Px = exp(gamln((r + 1) / 2) - gamln(r / 2)) - Px /= sqrt(r * pi) * (1 + (x ** 2) / r) ** ((r + 1) / 2) + r = asarray(df*1.0) + Px = exp(gamln((r+1)/2)-gamln(r/2)) + Px /= sqrt(r*pi)*(1+(x**2)/r)**((r+1)/2) return Px def _logpdf(self, x, df): - r = df * 1.0 - lPx = gamln((r + 1) / 2) - gamln(r / 2) - lPx -= 0.5 * log(r * pi) + (r + 1) / 2 * log(1 + (x ** 2) / r) + r = df*1.0 + lPx = gamln((r+1)/2)-gamln(r/2) + lPx -= 0.5*log(r*pi) + (r+1)/2*log(1+(x**2)/r) return lPx def _cdf(self, x, df): @@ -3514,15 +3705,18 @@ class t_gen(rv_continuous): return -special.stdtrit(df, q) def _stats(self, df): - mu2 = where(df > 2, df / (df - 2.0), inf) - g1 = where(df > 3, 0.0, nan) - g2 = where(df > 4, 6.0 / (df - 4.0), nan) + mu2 = _lazywhere(df > 2, (df,), + lambda df: df / (df-2.0), + np.inf) + g1 = where(df > 3, 0.0, np.nan) + g2 = _lazywhere(df > 4, (df,), + lambda df: 6.0 / (df-4.0), + np.nan) return 0, mu2, g1, g2 t = t_gen(name='t') class nct_gen(rv_continuous): - """A non-central Student's T continuous random variable. %(before_notes)s @@ -3531,39 +3725,43 @@ class nct_gen(rv_continuous): ----- The probability density function for `nct` is:: - df**(df/2) * gamma(df+1) - nct.pdf(x, df, nc) = ---------------------------------------------------- - 2**df*exp(nc**2/2) * (df+x**2)**(df/2) * gamma(df/2) + df**(df/2) * gamma(df+1) + nct.pdf(x, df, nc) = ---------------------------------------------------- + 2**df*exp(nc**2/2) * (df+x**2)**(df/2) * gamma(df/2) for ``df > 0``. + `nct` takes ``df`` and ``nc`` as shape parameters. + + %(after_notes)s + %(example)s """ - def _argcheck(self, df, nc): return (df > 0) & (nc == nc) def _rvs(self, df, nc): - return (norm.rvs(loc=nc, size=self._size) * sqrt(df) / - sqrt(chi2.rvs(df, size=self._size))) + sz, rndm = self._size, self._random_state + n = norm.rvs(loc=nc, size=sz, random_state=rndm) + c2 = chi2.rvs(df, size=sz, random_state=rndm) + return n * sqrt(df) / sqrt(c2) def _pdf(self, x, df, nc): - n = df * 1.0 - nc = nc * 1.0 - x2 = x * x - ncx2 = nc * nc * x2 + n = df*1.0 + nc = nc*1.0 + x2 = x*x + ncx2 = nc*nc*x2 fac1 = n + x2 - trm1 = n / 2. 
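The ``_stats`` rewrite above uses ``_lazywhere`` so the Student ``t`` moments degrade gracefully to ``inf``/``nan`` for small ``df``; in the opposite limit, ``df -> inf``, the density approaches the standard normal, which makes a convenient numerical check (default ``allclose`` tolerances, large ``df`` chosen arbitrarily)::

    >>> import numpy as np
    >>> from scipy.stats import t, norm
    >>> x = np.linspace(-3, 3, 7)
    >>> np.allclose(t.pdf(x, 1e8), norm.pdf(x))
    True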
* log(n) + gamln(n + 1) - trm1 -= n * log(2) + nc * nc / 2. + (n / 2.) * \ - log(fac1) + gamln(n / 2.) + trm1 = n/2.*log(n) + gamln(n+1) + trm1 -= n*log(2)+nc*nc/2.+(n/2.)*log(fac1)+gamln(n/2.) Px = exp(trm1) - valF = ncx2 / (2 * fac1) - trm1 = sqrt(2) * nc * x * special.hyp1f1(n / 2 + 1, 1.5, valF) - trm1 /= asarray(fac1 * special.gamma((n + 1) / 2)) - trm2 = special.hyp1f1((n + 1) / 2, 0.5, valF) - trm2 /= asarray(sqrt(fac1) * special.gamma(n / 2 + 1)) - Px *= trm1 + trm2 + valF = ncx2 / (2*fac1) + trm1 = sqrt(2)*nc*x*special.hyp1f1(n/2+1, 1.5, valF) + trm1 /= asarray(fac1*special.gamma((n+1)/2)) + trm2 = special.hyp1f1((n+1)/2, 0.5, valF) + trm2 /= asarray(sqrt(fac1)*special.gamma(n/2+1)) + Px *= trm1+trm2 return Px def _cdf(self, x, df, nc): @@ -3595,36 +3793,35 @@ class nct_gen(rv_continuous): # Biometrika 48, p. 465 (2961). # e.g. http://www.jstor.org/stable/2332772 (gated) # - g1 = g2 = None - - gfac = gam(df / 2. - 0.5) / gam(df / 2.) - c11 = sqrt(df / 2.) * gfac - c20 = df / (df - 2.) - c22 = c20 - c11 * c11 - mu = np.where(df > 1, nc * c11, np.inf) - mu2 = np.where(df > 2, c22 * nc * nc + c20, np.inf) + mu, mu2, g1, g2 = None, None, None, None + + gfac = gam(df/2.-0.5) / gam(df/2.) + c11 = sqrt(df/2.) * gfac + c20 = df / (df-2.) + c22 = c20 - c11*c11 + mu = np.where(df > 1, nc*c11, np.inf) + mu2 = np.where(df > 2, c22*nc*nc + c20, np.inf) if 's' in moments: - c33t = df * (7. - 2. * df) / (df - 2.) / (df - 3.) + 2. * c11 * c11 - c31t = 3. * df / (df - 2.) / (df - 3.) - mu3 = (c33t * nc * nc + c31t) * c11 * nc + c33t = df * (7.-2.*df) / (df-2.) / (df-3.) + 2.*c11*c11 + c31t = 3.*df / (df-2.) / (df-3.) + mu3 = (c33t*nc*nc + c31t) * c11*nc g1 = np.where(df > 3, mu3 / np.power(mu2, 1.5), np.nan) - # kurtosis + #kurtosis if 'k' in moments: - c44 = df * df / (df - 2.) / (df - 4.) - c44 -= c11 * c11 * 2. * df * (5. - df) / (df - 2.) / (df - 3.) - c44 -= 3. * c11 ** 4 - c42 = df / (df - 4.) - c11 * c11 * (df - 1.) / (df - 3.) - c42 *= 6. * df / (df - 2.) - c40 = 3. * df * df / (df - 2.) / (df - 4.) - - mu4 = c44 * nc ** 4 + c42 * nc ** 2 + c40 - g2 = np.where(df > 4, mu4 / mu2 ** 2 - 3., np.nan) + c44 = df*df / (df-2.) / (df-4.) + c44 -= c11*c11 * 2.*df*(5.-df) / (df-2.) / (df-3.) + c44 -= 3.*c11**4 + c42 = df / (df-4.) - c11*c11 * (df-1.) / (df-3.) + c42 *= 6.*df / (df-2.) + c40 = 3.*df*df / (df-2.) / (df-4.) + + mu4 = c44 * nc**4 + c42*nc**2 + c40 + g2 = np.where(df > 4, mu4/mu2**2 - 3., np.nan) return mu, mu2, g1, g2 nct = nct_gen(name="nct") class pareto_gen(rv_continuous): - """A Pareto continuous random variable. %(before_notes)s @@ -3637,18 +3834,21 @@ class pareto_gen(rv_continuous): for ``x >= 1``, ``b > 0``. + `pareto` takes ``b`` as a shape parameter. 
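The `nct` moment code above builds the mean as ``nc*c11`` with ``c11 = sqrt(df/2)*gamma((df-1)/2)/gamma(df/2)``; restating that as a direct formula check (editorial sketch)::

    >>> import numpy as np
    >>> from scipy.stats import nct
    >>> from scipy.special import gamma as G
    >>> df, nc = 10, 1.5
    >>> expected = nc * np.sqrt(df / 2.) * G((df - 1) / 2.) / G(df / 2.)
    >>> np.allclose(float(nct.stats(df, nc, moments='m')), expected)
    True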
+ + %(after_notes)s + %(example)s """ - def _pdf(self, x, b): - return b * x ** (-b - 1) + return b * x**(-b-1) def _cdf(self, x, b): - return 1 - x ** (-b) + return 1 - x**(-b) def _ppf(self, q, b): - return pow(1 - q, -1.0 / b) + return pow(1-q, -1.0/b) def _stats(self, b, moments='mv'): mu, mu2, g1, g2 = None, None, None, None @@ -3656,12 +3856,12 @@ class pareto_gen(rv_continuous): mask = b > 1 bt = extract(mask, b) mu = valarray(shape(b), value=inf) - place(mu, mask, bt / (bt - 1.0)) + place(mu, mask, bt / (bt-1.0)) if 'v' in moments: mask = b > 2 bt = extract(mask, b) mu2 = valarray(shape(b), value=inf) - place(mu2, mask, bt / (bt - 2.0) / (bt - 1.0) ** 2) + place(mu2, mask, bt / (bt-2.0) / (bt-1.0)**2) if 's' in moments: mask = b > 3 bt = extract(mask, b) @@ -3672,18 +3872,17 @@ class pareto_gen(rv_continuous): mask = b > 4 bt = extract(mask, b) g2 = valarray(shape(b), value=nan) - vals = (6.0 * polyval([1.0, 1.0, -6, -2], bt) / + vals = (6.0*polyval([1.0, 1.0, -6, -2], bt) / polyval([1.0, -7.0, 12.0, 0.0], bt)) place(g2, mask, vals) return mu, mu2, g1, g2 def _entropy(self, c): - return 1 + 1.0 / c - log(c) + return 1 + 1.0/c - log(c) pareto = pareto_gen(a=1.0, name="pareto") class lomax_gen(rv_continuous): - """A Lomax (Pareto of the second kind) continuous random variable. %(before_notes)s @@ -3699,39 +3898,41 @@ class lomax_gen(rv_continuous): for ``x >= 0``, ``c > 0``. + `lomax` takes ``c`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _pdf(self, x, c): - return c * 1.0 / (1.0 + x) ** (c + 1.0) + return c*1.0/(1.0+x)**(c+1.0) def _logpdf(self, x, c): - return log(c) - (c + 1) * log1p(x) + return log(c) - (c+1)*special.log1p(x) def _cdf(self, x, c): - return -expm1(-c * log1p(x)) + return -special.expm1(-c*special.log1p(x)) def _sf(self, x, c): - return exp(-c * log1p(x)) + return exp(-c*special.log1p(x)) def _logsf(self, x, c): - return -c * log1p(x) + return -c*special.log1p(x) def _ppf(self, q, c): - return expm1(-log1p(-q) / c) + return special.expm1(-special.log1p(-q)/c) def _stats(self, c): mu, mu2, g1, g2 = pareto.stats(c, loc=-1.0, moments='mvsk') return mu, mu2, g1, g2 def _entropy(self, c): - return 1 + 1.0 / c - log(c) + return 1+1.0/c-log(c) lomax = lomax_gen(a=0.0, name="lomax") class pearson3_gen(rv_continuous): - """A pearson type III continuous random variable. %(before_notes)s @@ -3749,6 +3950,10 @@ class pearson3_gen(rv_continuous): alpha = (stddev * beta)**2 zeta = loc - alpha / beta + `pearson3` takes ``skew`` as a shape parameter. + + %(after_notes)s + %(example)s References @@ -3764,7 +3969,6 @@ class pearson3_gen(rv_continuous): Aviation Loads Data", Office of Aviation Research (2003). """ - def _preprocess(self, x, skew): # The real 'loc' and 'scale' are handled in the calling pdf(...). 
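The `lomax` ``_stats`` above simply reuses ``pareto.stats(c, loc=-1.0)``, reflecting that Lomax is the Pareto law shifted to start at the origin; a direct density check (editorial aside)::

    >>> import numpy as np
    >>> from scipy.stats import lomax, pareto
    >>> x, c = np.linspace(0, 5, 6), 2.5
    >>> np.allclose(lomax.pdf(x, c), pareto.pdf(x + 1, c))
    True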
The # local variables 'loc' and 'scale' within pearson3._pdf are set to @@ -3786,7 +3990,7 @@ class pearson3_gen(rv_continuous): invmask = ~mask beta = 2.0 / (skew[invmask] * scale) - alpha = (scale * beta) ** 2 + alpha = (scale * beta)**2 zeta = loc - alpha / beta transx = beta * (x[invmask] - zeta) @@ -3800,12 +4004,11 @@ class pearson3_gen(rv_continuous): return np.ones(np.shape(skew), dtype=bool) def _stats(self, skew): - # ans, x, transx, skew, mask, invmask, beta, alpha, zeta = ( - _1, _2, _3, skew, _4, _5, beta, alpha, zeta = self._preprocess([1], - skew) + ans, x, transx, skew, mask, invmask, beta, alpha, zeta = ( + self._preprocess([1], skew)) m = zeta + alpha / beta - v = alpha / (beta ** 2) - s = 2.0 / (alpha ** 0.5) * np.sign(beta) + v = alpha / (beta**2) + s = 2.0 / (alpha**0.5) * np.sign(beta) k = 6.0 / alpha return m, v, s, k @@ -3826,7 +4029,7 @@ class pearson3_gen(rv_continuous): # + (alpha - 1)*log(beta*(x - zeta)) + (a - 1)*log(x) # - beta*(x - zeta) - x # - gamln(alpha) - gamln(a) - ans, x, transx, skew, mask, invmask, beta, alpha, _zeta = ( + ans, x, transx, skew, mask, invmask, beta, alpha, zeta = ( self._preprocess(x, skew)) ans[mask] = np.log(_norm_pdf(x[mask])) @@ -3834,7 +4037,7 @@ class pearson3_gen(rv_continuous): return ans def _cdf(self, x, skew): - ans, x, transx, skew, mask, invmask, _beta, alpha, _zeta = ( + ans, x, transx, skew, mask, invmask, beta, alpha, zeta = ( self._preprocess(x, skew)) ans[mask] = _norm_cdf(x[mask]) @@ -3842,26 +4045,25 @@ class pearson3_gen(rv_continuous): return ans def _rvs(self, skew): - _ans, _x, _transx, skew, mask, _invmask, beta, alpha, zeta = ( + ans, x, transx, skew, mask, invmask, beta, alpha, zeta = ( self._preprocess([0], skew)) if mask[0]: - return mtrand.standard_normal(self._size) - ans = mtrand.standard_gamma(alpha, self._size) / beta + zeta + return self._random_state.standard_normal(self._size) + ans = self._random_state.standard_gamma(alpha, self._size)/beta + zeta if ans.size == 1: return ans[0] return ans def _ppf(self, q, skew): - ans, q, _transq, skew, mask, invmask, beta, alpha, zeta = ( + ans, q, transq, skew, mask, invmask, beta, alpha, zeta = ( self._preprocess(q, skew)) ans[mask] = _norm_ppf(q[mask]) - ans[invmask] = special.gammaincinv(alpha, q[invmask]) / beta + zeta + ans[invmask] = special.gammaincinv(alpha, q[invmask])/beta + zeta return ans pearson3 = pearson3_gen(name="pearson3") class powerlaw_gen(rv_continuous): - """A power-function continuous random variable. %(before_notes)s @@ -3874,26 +4076,29 @@ class powerlaw_gen(rv_continuous): for ``0 <= x <= 1``, ``a > 0``. + `powerlaw` takes ``a`` as a shape parameter. + + %(after_notes)s + `powerlaw` is a special case of `beta` with ``b == 1``. %(example)s """ - def _pdf(self, x, a): - return a * x ** (a - 1.0) + return a*x**(a-1.0) def _logpdf(self, x, a): return log(a) + special.xlogy(a - 1, x) def _cdf(self, x, a): - return x ** (a * 1.0) + return x**(a*1.0) def _logcdf(self, x, a): - return a * log(x) + return a*log(x) def _ppf(self, q, a): - return pow(q, 1.0 / a) + return pow(q, 1.0/a) def _stats(self, a): return (a / (a + 1.0), @@ -3902,12 +4107,11 @@ class powerlaw_gen(rv_continuous): 6 * polyval([1, -1, -6, 2], a) / (a * (a + 3.0) * (a + 4))) def _entropy(self, a): - return 1 - 1.0 / a - log(a) + return 1 - 1.0/a - log(a) powerlaw = powerlaw_gen(a=0.0, b=1.0, name="powerlaw") class powerlognorm_gen(rv_continuous): - """A power log-normal continuous random variable. 
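In the ``_preprocess`` machinery above, skews near zero are routed to the plain normal branch, so `pearson3` with ``skew=0`` is exactly the standard normal; a quick confirmation (editorial aside)::

    >>> import numpy as np
    >>> from scipy.stats import pearson3, norm
    >>> x = np.linspace(-3, 3, 7)
    >>> np.allclose(pearson3.pdf(x, skew=0), norm.pdf(x))
    True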
%(before_notes)s @@ -3922,16 +4126,19 @@ class powerlognorm_gen(rv_continuous): where ``phi`` is the normal pdf, and ``Phi`` is the normal cdf, and ``x > 0``, ``s, c > 0``. + `powerlognorm` takes ``c`` and ``s`` as shape parameters. + + %(after_notes)s + %(example)s """ - def _pdf(self, x, c, s): - return (c / (x * s) * _norm_pdf(log(x) / s) * - pow(_norm_cdf(-log(x) / s), c * 1.0 - 1.0)) + return (c/(x*s) * _norm_pdf(log(x)/s) * + pow(_norm_cdf(-log(x)/s), c*1.0-1.0)) def _cdf(self, x, c, s): - return 1.0 - pow(_norm_cdf(-log(x) / s), c * 1.0) + return 1.0 - pow(_norm_cdf(-log(x)/s), c*1.0) def _ppf(self, q, c, s): return exp(-s * _norm_ppf(pow(1.0 - q, 1.0 / c))) @@ -3939,7 +4146,6 @@ powerlognorm = powerlognorm_gen(a=0.0, name="powerlognorm") class powernorm_gen(rv_continuous): - """A power normal continuous random variable. %(before_notes)s @@ -3953,18 +4159,21 @@ class powernorm_gen(rv_continuous): where ``phi`` is the normal pdf, and ``Phi`` is the normal cdf, and ``x > 0``, ``c > 0``. + `powernorm` takes ``c`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _pdf(self, x, c): - return (c * _norm_pdf(x) * (_norm_cdf(-x) ** (c - 1.0))) + return (c*_norm_pdf(x) * (_norm_cdf(-x)**(c-1.0))) def _logpdf(self, x, c): - return log(c) + _norm_logpdf(x) + (c - 1) * _norm_logcdf(-x) + return log(c) + _norm_logpdf(x) + (c-1)*_norm_logcdf(-x) def _cdf(self, x, c): - return 1.0 - _norm_cdf(-x) ** (c * 1.0) + return 1.0-_norm_cdf(-x)**(c*1.0) def _ppf(self, q, c): return -_norm_ppf(pow(1.0 - q, 1.0 / c)) @@ -3972,7 +4181,6 @@ powernorm = powernorm_gen(name='powernorm') class rdist_gen(rv_continuous): - """An R-distributed continuous random variable. %(before_notes)s @@ -3985,21 +4193,23 @@ class rdist_gen(rv_continuous): for ``-1 <= x <= 1``, ``c > 0``. + `rdist` takes ``c`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _pdf(self, x, c): - return np.power((1.0 - x ** 2), c / 2.0 - 1) / \ - special.beta(0.5, c / 2.0) + return np.power((1.0 - x**2), c / 2.0 - 1) / special.beta(0.5, c / 2.0) def _cdf(self, x, c): term1 = x / special.beta(0.5, c / 2.0) - res = 0.5 + term1 * special.hyp2f1(0.5, 1 - c / 2.0, 1.5, x ** 2) + res = 0.5 + term1 * special.hyp2f1(0.5, 1 - c / 2.0, 1.5, x**2) # There's an issue with hyp2f1, it returns nans near x = +-1, c > 100. # Use the generic implementation in that case. See gh-1285 for # background. - if any(np.isnan(res)): + if np.any(np.isnan(res)): return rv_continuous._cdf(self, x, c) return res @@ -4010,7 +4220,6 @@ rdist = rdist_gen(a=-1.0, b=1.0, name="rdist") class rayleigh_gen(rv_continuous): - """A Rayleigh continuous random variable. %(before_notes)s @@ -4025,6 +4234,8 @@ class rayleigh_gen(rv_continuous): `rayleigh` is a special case of `chi` with ``df == 2``. 
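The ``df == 2`` claim just above can be verified directly (illustrative aside, assuming scipy is installed):

>>> import numpy as np
>>> from scipy.stats import rayleigh, chi
>>> r = np.linspace(0.1, 4, 9)
>>> np.allclose(rayleigh.pdf(r), chi.pdf(r, 2))
True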
+ %(after_notes)s + %(example)s """ @@ -4038,7 +4249,7 @@ class rayleigh_gen(rv_continuous): raise IndexError('Index to the fixed parameter is out of bounds') def _rvs(self): - return chi.rvs(2, size=self._size) + return chi.rvs(2, size=self._size, random_state=self._random_state) def _pdf(self, r): return exp(self._logpdf(r)) @@ -4048,24 +4259,24 @@ class rayleigh_gen(rv_continuous): return where(rr2 == inf, - rr2, log(r) - rr2) def _cdf(self, r): - return - expm1(-r * r / 2.0) - - def _sf(self, r): - return exp(-r * r / 2.0) + return -special.expm1(-0.5 * r**2) def _ppf(self, q): - return sqrt(-2 * log1p(-q)) + return sqrt(-2 * special.log1p(-q)) + + def _sf(self, r): + return exp(-0.5 * r**2) def _isf(self, q): return sqrt(-2 * log(q)) def _stats(self): val = 4 - pi - return (np.sqrt(pi / 2), val / 2, 2 * (pi - 3) * sqrt(pi) / val ** 1.5, - 6 * pi / val - 16 / val ** 2) + return (np.sqrt(pi/2), val/2, 2*(pi-3)*sqrt(pi)/val**1.5, + 6*pi/val-16/val**2) def _entropy(self): - return _EULER / 2.0 + 1 - 0.5 * log(2) + return _EULER/2.0 + 1 - 0.5*log(2) rayleigh = rayleigh_gen(a=0.0, name="rayleigh") @@ -4146,7 +4357,6 @@ truncrayleigh = truncrayleigh_gen(a=0.0, name="truncrayleigh", shapes='c') class reciprocal_gen(rv_continuous): - """A reciprocal continuous random variable. %(before_notes)s @@ -4159,14 +4369,17 @@ class reciprocal_gen(rv_continuous): for ``a <= x <= b``, ``a, b > 0``. + `reciprocal` takes ``a`` and ``b`` as shape parameters. + + %(after_notes)s + %(example)s """ - def _argcheck(self, a, b): self.a = a self.b = b - self.d = log(b * 1.0 / a) + self.d = log(b*1.0 / a) return (a > 0) & (b > 0) & (b > a) def _pdf(self, x, a, b): @@ -4176,13 +4389,13 @@ class reciprocal_gen(rv_continuous): return -log(x) - log(self.d) def _cdf(self, x, a, b): - return (log(x) - log(a)) / self.d + return (log(x)-log(a)) / self.d def _ppf(self, q, a, b): - return a * pow(b * 1.0 / a, q) + return a*pow(b*1.0/a, q) def _munp(self, n, a, b): - return 1.0 / self.d / n * (pow(b * 1.0, n) - pow(a * 1.0, n)) + return 1.0/self.d / n * (pow(b*1.0, n) - pow(a*1.0, n)) def _fitstart(self, data): a = np.min(data) @@ -4196,13 +4409,10 @@ class reciprocal_gen(rv_continuous): return super(reciprocal_gen, self)._fitstart(data, args=(a, b)) def _entropy(self, a, b): - return 0.5 * log(a * b) + log(log(b / a)) + return 0.5*log(a*b)+log(log(b/a)) reciprocal = reciprocal_gen(name="reciprocal") - -# FIXME: PPF does not work. class rice_gen(rv_continuous): - """A Rice continuous random variable. %(before_notes)s @@ -4215,34 +4425,52 @@ class rice_gen(rv_continuous): for ``x > 0``, ``b > 0``. + `rice` takes ``b`` as a shape parameter. + + %(after_notes)s + + The Rice distribution describes the length, ``r``, of a 2-D vector + with components ``(U+u, V+v)``, where ``U, V`` are constant, ``u, v`` + are independent Gaussian random variables with standard deviation + ``s``. Let ``R = (U**2 + V**2)**0.5``. Then the pdf of ``r`` is + ``rice.pdf(x, R/s, scale=s)``. 
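A Monte Carlo sketch of the vector-length construction described above, mirroring the ``_rvs`` implementation in this hunk (seed, sample size and tolerance are arbitrary):

>>> import numpy as np
>>> from scipy.stats import rice
>>> rng = np.random.RandomState(0)
>>> t = 2.0 / np.sqrt(2) + rng.standard_normal(size=(2, 100000))
>>> samples = np.sqrt((t * t).sum(axis=0))  # lengths are Rice(b=2) distributed
>>> abs(samples.mean() - rice.mean(2.0)) < 0.05
True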
+ %(example)s """ - def _argcheck(self, b): return b >= 0 def _rvs(self, b): # http://en.wikipedia.org/wiki/Rice_distribution sz = self._size if self._size else 1 - t = b / np.sqrt(2) + mtrand.standard_normal(size=(2, sz)) - return np.sqrt((t * t).sum(axis=0)) + t = b/np.sqrt(2) + self._random_state.standard_normal(size=(2, sz)) + return np.sqrt((t*t).sum(axis=0)) + + def _cdf(self, x, b): + return chndtr(np.square(x), 2, np.square(b)) + + def _ppf(self, q, b): + return np.sqrt(chndtrix(q, 2, np.square(b))) def _pdf(self, x, b): - return x * exp(-(x - b) * (x - b) / 2.0) * special.i0e(x * b) + # We use (x**2 + b**2)/2 = ((x-b)**2)/2 + xb. + # The factor of exp(-xb) is then included in the i0e function + # in place of the modified Bessel function, i0, improving + # numerical stability for large values of xb. + return x * exp(-(x-b)*(x-b)/2.0) * special.i0e(x*b) def _munp(self, n, b): - nd2 = n / 2.0 + nd2 = n/2.0 n1 = 1 + nd2 - b2 = b * b / 2.0 + b2 = b*b/2.0 - return (2.0 ** (nd2) * exp(-b2) * special.gamma(n1) * + return (2.0**(nd2) * exp(-b2) * special.gamma(n1) * special.hyp1f1(n1, 1, b2)) rice = rice_gen(a=0.0, name="rice") # FIXME: PPF does not work. class recipinvgauss_gen(rv_continuous): - """A reciprocal inverse Gaussian continuous random variable. %(before_notes)s @@ -4255,32 +4483,31 @@ class recipinvgauss_gen(rv_continuous): for ``x >= 0``. + `recipinvgauss` takes ``mu`` as a shape parameter. + + %(after_notes)s + %(example)s """ - def _rvs(self, mu): - return 1.0 / mtrand.wald(mu, 1.0, size=self._size) + return 1.0/self._random_state.wald(mu, 1.0, size=self._size) def _pdf(self, x, mu): - return 1.0 / sqrt(2 * pi * x) * \ - exp(-(1 - mu * x) ** 2.0 / (2 * x * mu ** 2.0)) + return 1.0/sqrt(2*pi*x)*exp(-(1-mu*x)**2.0 / (2*x*mu**2.0)) def _logpdf(self, x, mu): - return -(1 - mu * x) ** 2.0 / \ - (2 * x * mu ** 2.0) - 0.5 * log(2 * pi * x) + return -(1-mu*x)**2.0 / (2*x*mu**2.0) - 0.5*log(2*pi*x) def _cdf(self, x, mu): - trm1 = 1.0 / mu - x - trm2 = 1.0 / mu + x - isqx = 1.0 / sqrt(x) - return 1.0 - _norm_cdf(isqx * trm1) - \ - exp(2.0 / mu) * _norm_cdf(-isqx * trm2) + trm1 = 1.0/mu - x + trm2 = 1.0/mu + x + isqx = 1.0/sqrt(x) + return 1.0-_norm_cdf(isqx*trm1)-exp(2.0/mu)*_norm_cdf(-isqx*trm2) recipinvgauss = recipinvgauss_gen(a=0.0, name='recipinvgauss') class semicircular_gen(rv_continuous): - """A semicircular continuous random variable. %(before_notes)s @@ -4293,15 +4520,16 @@ class semicircular_gen(rv_continuous): for ``-1 <= x <= 1``. + %(after_notes)s + %(example)s """ - def _pdf(self, x): - return 2.0 / pi * sqrt(1 - x * x) + return 2.0/pi*sqrt(1-x*x) def _cdf(self, x): - return 0.5 + 1.0 / pi * (x * sqrt(1 - x * x) + arcsin(x)) + return 0.5+1.0/pi*(x*sqrt(1-x*x) + arcsin(x)) def _stats(self): return 0, 0.25, 0, -1.0 @@ -4311,8 +4539,65 @@ semicircular = semicircular_gen(a=-1.0, b=1.0, name="semicircular") -class triang_gen(rv_continuous): +class skew_norm_gen(rv_continuous): + """A skew-normal random variable. + + %(before_notes)s + + Notes + ----- + The pdf is + + skewnorm.pdf(x, a) = 2*norm.pdf(x)*norm.cdf(a*x) + + `skewnorm` takes ``a`` as a skewness parameter. + When ``a = 0`` the distribution is identical to a normal distribution. + `rvs` implements the method of [1]_. + + %(after_notes)s + + %(example)s + + References + ---------- + + .. [1] A. Azzalini and A. Capitanio (1999). Statistical applications of the + multivariate skew-normal distribution. J. Roy. Statist. Soc., B 61, 579-602.
+ http://azzalini.stat.unipd.it/SN/faq-r.html + """ + + def _argcheck(self, a): + return np.isfinite(a) + + def _pdf(self, x, a): + return 2.*_norm_pdf(x)*_norm_cdf(a*x) + + def _rvs(self, a): + u0 = self._random_state.normal(size=self._size) + v = self._random_state.normal(size=self._size) + d = a/np.sqrt(1 + a**2) + u1 = d*u0 + v*np.sqrt(1 - d**2) + return np.where(u0 >= 0, u1, -u1) + + def _stats(self, a, moments='mvsk'): + output = [None, None, None, None] + const = np.sqrt(2/pi) * a/np.sqrt(1 + a**2) + + if 'm' in moments: + output[0] = const + if 'v' in moments: + output[1] = 1 - const**2 + if 's' in moments: + output[2] = ((4 - pi)/2) * (const/np.sqrt(1 - const**2))**3 + if 'k' in moments: + output[3] = (2*(pi - 3)) * (const**4/(1 - const**2)**2) + + return output + +skewnorm = skew_norm_gen(name='skewnorm') + +class triang_gen(rv_continuous): """A triangular continuous random variable. %(before_notes)s @@ -4323,6 +4608,10 @@ class triang_gen(rv_continuous): ``loc`` to ``(loc + c*scale)`` and then downsloping for ``(loc + c*scale)`` to ``(loc+scale)``. + `triang` takes ``c`` as a shape parameter. + + %(after_notes)s + The standard form is in the range [0, 1] with c the mode. The location parameter shifts the start to `loc`. The scale parameter changes the width from 1 to `scale`. @@ -4330,33 +4619,31 @@ class triang_gen(rv_continuous): %(example)s """ - def _rvs(self, c): - return mtrand.triangular(0, c, 1, self._size) + return self._random_state.triangular(0, c, 1, self._size) def _argcheck(self, c): return (c >= 0) & (c <= 1) def _pdf(self, x, c): - return where(x < c, 2 * x / c, 2 * (1 - x) / (1 - c)) + return where(x < c, 2*x/c, 2*(1-x)/(1-c)) def _cdf(self, x, c): - return where(x < c, x * x / c, (x * x - 2 * x + c) / (c - 1)) + return where(x < c, x*x/c, (x*x-2*x+c)/(c-1)) def _ppf(self, q, c): - return where(q < c, sqrt(c * q), 1 - sqrt((1 - c) * (1 - q))) + return where(q < c, sqrt(c*q), 1-sqrt((1-c)*(1-q))) def _stats(self, c): - return (c + 1.0) / 3.0, (1.0 - c + c * c) / 18, sqrt(2) * (2 * c - 1) * (c + 1) * (c - 2) / \ - (5 * np.power((1.0 - c + c * c), 1.5)), -3.0 / 5.0 + return (c+1.0)/3.0, (1.0-c+c*c)/18, sqrt(2)*(2*c-1)*(c+1)*(c-2) / \ + (5 * np.power((1.0-c+c*c), 1.5)), -3.0/5.0 def _entropy(self, c): - return 0.5 - log(2) + return 0.5-log(2) triang = triang_gen(a=0.0, b=1.0, name="triang") class truncexpon_gen(rv_continuous): - """A truncated exponential continuous random variable. %(before_notes)s @@ -4369,33 +4656,36 @@ class truncexpon_gen(rv_continuous): for ``0 < x < b``. + `truncexpon` takes ``b`` as a shape parameter. 
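A sketch of the Azzalini sampling construction used in ``skew_norm_gen._rvs`` above, checked against the closed-form mean from ``_stats``, ``E[X] = sqrt(2/pi) * a / sqrt(1 + a**2)`` (seed, sample size and tolerance are arbitrary):

>>> import numpy as np
>>> a = 3.0
>>> rng = np.random.RandomState(42)
>>> u0, v = rng.normal(size=(2, 200000))
>>> d = a / np.sqrt(1 + a**2)
>>> u1 = d * u0 + v * np.sqrt(1 - d**2)
>>> x = np.where(u0 >= 0, u1, -u1)  # flip sign according to u0
>>> abs(x.mean() - np.sqrt(2 / np.pi) * d) < 0.05
True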
+ + %(after_notes)s + %(example)s """ - def _argcheck(self, b): self.b = b return (b > 0) def _pdf(self, x, b): - return exp(-x) / (-expm1(-b)) + return exp(-x)/(-special.expm1(-b)) def _logpdf(self, x, b): - return - x - log(-expm1(-b)) + return -x - log(-special.expm1(-b)) def _cdf(self, x, b): - return expm1(-x) / expm1(-b) + return special.expm1(-x)/special.expm1(-b) def _ppf(self, q, b): - return - log1p(q * expm1(-b)) + return -special.log1p(q*special.expm1(-b)) def _munp(self, n, b): # wrong answer with formula, same as in continuous.pdf # return gam(n+1)-special.gammainc(1+n, b) if n == 1: - return (1 - (b + 1) * exp(-b)) / (-expm1(-b)) + return (1-(b+1)*exp(-b))/(-special.expm1(-b)) elif n == 2: - return 2 * (1 - 0.5 * (b * b + 2 * b + 2) * exp(-b)) / (-expm1(-b)) + return 2*(1-0.5*(b*b+2*b+2)*exp(-b))/(-special.expm1(-b)) else: # return generic for higher moments # return rv_continuous._mom1_sc(self, n, b) @@ -4403,12 +4693,11 @@ class truncexpon_gen(rv_continuous): def _entropy(self, b): eB = exp(b) - return log(eB - 1) + (1 + eB * (b - 1.0)) / (1.0 - eB) + return log(eB-1)+(1+eB*(b-1.0))/(1.0-eB) truncexpon = truncexpon_gen(a=0.0, name='truncexpon') class truncnorm_gen(rv_continuous): - """A truncated normal continuous random variable. %(before_notes)s @@ -4422,10 +4711,13 @@ class truncnorm_gen(rv_continuous): a, b = (myclip_a - my_mean) / my_std, (myclip_b - my_mean) / my_std + `truncnorm` takes ``a`` and ``b`` as shape parameters. + + %(after_notes)s + %(example)s """ - def _argcheck(self, a, b): self.a = a self.b = b @@ -4451,23 +4743,22 @@ class truncnorm_gen(rv_continuous): def _ppf(self, q, a, b): if self.a > 0: - return _norm_isf(q * self._sb + self._sa * (1.0 - q)) + return _norm_isf(q*self._sb + self._sa*(1.0-q)) else: - return _norm_ppf(q * self._nb + self._na * (1.0 - q)) + return _norm_ppf(q*self._nb + self._na*(1.0-q)) def _stats(self, a, b): nA, nB = self._na, self._nb d = nB - nA pA, pB = _norm_pdf(a), _norm_pdf(b) mu = (pA - pB) / d # correction sign - mu2 = 1 + (a * pA - b * pB) / d - mu * mu + mu2 = 1 + (a*pA - b*pB) / d - mu*mu return mu, mu2, None, None truncnorm = truncnorm_gen(name='truncnorm') # FIXME: RVS does not work. class tukeylambda_gen(rv_continuous): - """A Tukey-Lamdba continuous random variable. %(before_notes)s @@ -4483,40 +4774,39 @@ class tukeylambda_gen(rv_continuous): - u-shape (lam = 0.5) - uniform from -1 to 1 (lam = 1) + `tukeylambda` takes ``lam`` as a shape parameter. 
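Two of the special cases listed above are easy to confirm through the percent point function (illustrative aside, assuming scipy is installed):

>>> import numpy as np
>>> from scipy.stats import tukeylambda, logistic, uniform
>>> q = np.linspace(0.1, 0.9, 5)
>>> np.allclose(tukeylambda.ppf(q, 0), logistic.ppf(q))  # lam = 0
True
>>> np.allclose(tukeylambda.ppf(q, 1), uniform.ppf(q, loc=-1, scale=2))  # lam = 1
True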
+ + %(after_notes)s + %(example)s """ - def _argcheck(self, lam): return np.ones(np.shape(lam), dtype=bool) def _pdf(self, x, lam): Fx = asarray(special.tklmbda(x, lam)) - Px = Fx ** (lam - 1.0) + (asarray(1 - Fx)) ** (lam - 1.0) - Px = 1.0 / asarray(Px) - return where((lam <= 0) | (abs(x) < 1.0 / asarray(lam)), Px, 0.0) + Px = Fx**(lam-1.0) + (asarray(1-Fx))**(lam-1.0) + Px = 1.0/asarray(Px) + return where((lam <= 0) | (abs(x) < 1.0/asarray(lam)), Px, 0.0) def _cdf(self, x, lam): return special.tklmbda(x, lam) def _ppf(self, q, lam): - q = q * 1.0 - vals1 = (q ** lam - (1 - q) ** lam) / lam - vals2 = log(q / (1 - q)) - return where((lam == 0) & (q == q), vals2, vals1) + return special.boxcox(q, lam) - special.boxcox1p(-q, lam) def _stats(self, lam): return 0, _tlvar(lam), 0, _tlkurt(lam) def _entropy(self, lam): def integ(p): - return log(pow(p, lam - 1) + pow(1 - p, lam - 1)) + return log(pow(p, lam-1)+pow(1-p, lam-1)) return integrate.quad(integ, 0, 1)[0] tukeylambda = tukeylambda_gen(name='tukeylambda') class uniform_gen(rv_continuous): - """A uniform continuous random variable. This distribution is constant between `loc` and ``loc + scale``. @@ -4526,12 +4816,11 @@ class uniform_gen(rv_continuous): %(example)s """ - def _rvs(self): - return mtrand.uniform(0.0, 1.0, self._size) + return self._random_state.uniform(0.0, 1.0, self._size) def _pdf(self, x): - return 1.0 * (x == x) + return 1.0*(x == x) def _cdf(self, x): return x @@ -4540,7 +4829,7 @@ class uniform_gen(rv_continuous): return q def _stats(self): - return 0.5, 1.0 / 12, 0, -1.2 + return 0.5, 1.0/12, 0, -1.2 def _entropy(self): return 0.0 @@ -4548,7 +4837,6 @@ uniform = uniform_gen(a=0.0, b=1.0, name='uniform') class vonmises_gen(rv_continuous): - """A Von Mises continuous random variable. %(before_notes)s @@ -4564,6 +4852,10 @@ class vonmises_gen(rv_continuous): for ``-pi <= x <= pi``, ``kappa > 0``. + `vonmises` takes ``kappa`` as a shape parameter. + + %(after_notes)s + See Also -------- vonmises_line : The same distribution, defined on a [-pi, pi] segment @@ -4572,12 +4864,11 @@ class vonmises_gen(rv_continuous): %(example)s """ - def _rvs(self, kappa): - return mtrand.vonmises(0.0, kappa, size=self._size) + return self._random_state.vonmises(0.0, kappa, size=self._size) def _pdf(self, x, kappa): - return exp(kappa * cos(x)) / (2 * pi * special.i0(kappa)) + return exp(kappa * cos(x)) / (2*pi*special.i0(kappa)) def _cdf(self, x, kappa): return vonmises_cython.von_mises_cdf(kappa, x) @@ -4589,7 +4880,6 @@ vonmises_line = vonmises_gen(a=-np.pi, b=np.pi, name='vonmises_line') class wald_gen(invgauss_gen): - """A Wald continuous random variable. %(before_notes)s @@ -4604,11 +4894,12 @@ class wald_gen(invgauss_gen): `wald` is a special case of `invgauss` with ``mu == 1``. + %(after_notes)s + %(example)s """ - def _rvs(self): - return mtrand.wald(1.0, 1.0, size=self._size) + return self._random_state.wald(1.0, 1.0, size=self._size) def _pdf(self, x): return invgauss._pdf(x, 1.0) @@ -4625,7 +4916,6 @@ wald = wald_gen(a=0.0, name="wald") class wrapcauchy_gen(rv_continuous): - """A wrapped Cauchy continuous random variable. %(before_notes)s @@ -4638,45 +4928,169 @@ class wrapcauchy_gen(rv_continuous): for ``0 <= x <= 2*pi``, ``0 < c < 1``. + `wrapcauchy` takes ``c`` as a shape parameter. 
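The rewritten ``tukeylambda._ppf`` above relies on the identity ``(q**lam - (1-q)**lam)/lam == boxcox(q, lam) - boxcox1p(-q, lam)``, with the benefit that `boxcox` takes the correct logarithmic limit at ``lam = 0``. A quick check (assumes scipy >= 0.14 for ``special.boxcox``):

>>> import numpy as np
>>> from scipy.special import boxcox, boxcox1p
>>> q, lam = np.linspace(0.05, 0.95, 7), 0.3
>>> np.allclose((q**lam - (1 - q)**lam) / lam,
...             boxcox(q, lam) - boxcox1p(-q, lam))
True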
+ + %(after_notes)s + %(example)s """ - def _argcheck(self, c): return (c > 0) & (c < 1) def _pdf(self, x, c): - return (1.0 - c * c) / (2 * pi * (1 + c * c - 2 * c * cos(x))) + return (1.0-c*c)/(2*pi*(1+c*c-2*c*cos(x))) def _cdf(self, x, c): - output = 0.0 * x - val = (1.0 + c) / (1.0 - c) + output = np.zeros(x.shape, dtype=x.dtype) + val = (1.0+c)/(1.0-c) c1 = x < pi - c2 = 1 - c1 + c2 = 1-c1 xp = extract(c1, x) xn = extract(c2, x) - if (any(xn)): - valn = extract(c2, np.ones_like(x) * val) - xn = 2 * pi - xn - yn = tan(xn / 2.0) - on = 1.0 - 1.0 / pi * arctan(valn * yn) + if np.any(xn): + valn = extract(c2, np.ones_like(x)*val) + xn = 2*pi - xn + yn = tan(xn/2.0) + on = 1.0-1.0/pi*arctan(valn*yn) place(output, c2, on) - if (any(xp)): - valp = extract(c1, np.ones_like(x) * val) - yp = tan(xp / 2.0) - op = 1.0 / pi * arctan(valp * yp) + if np.any(xp): + valp = extract(c1, np.ones_like(x)*val) + yp = tan(xp/2.0) + op = 1.0/pi*arctan(valp*yp) place(output, c1, op) return output def _ppf(self, q, c): - val = (1.0 - c) / (1.0 + c) - rcq = 2 * arctan(val * tan(pi * q)) - rcmq = 2 * pi - 2 * arctan(val * tan(pi * (1 - q))) - return where(q < 1.0 / 2, rcq, rcmq) + val = (1.0-c)/(1.0+c) + rcq = 2*arctan(val*tan(pi*q)) + rcmq = 2*pi-2*arctan(val*tan(pi*(1-q))) + return where(q < 1.0/2, rcq, rcmq) def _entropy(self, c): - return log(2 * pi * (1 - c * c)) -wrapcauchy = wrapcauchy_gen(a=0.0, b=2 * pi, name='wrapcauchy') + return log(2*pi*(1-c*c)) +wrapcauchy = wrapcauchy_gen(a=0.0, b=2*pi, name='wrapcauchy') + + +class gennorm_gen(rv_continuous): + """A generalized normal continuous random variable. + + %(before_notes)s + + Notes + ----- + The probability density function for `gennorm` is [1]_:: + + beta + gennorm.pdf(x, beta) = --------------- exp(-|x|**beta) + 2 gamma(1/beta) + + `gennorm` takes ``beta`` as a shape parameter. + For ``beta = 1``, it is identical to a Laplace distribution. + For ``beta = 2``, it is identical to a normal distribution + (with ``scale=1/sqrt(2)``). + + See Also + -------- + laplace : Laplace distribution + norm : normal distribution + + References + ---------- + + .. [1] "Generalized normal distribution, Version 1", + https://en.wikipedia.org/wiki/Generalized_normal_distribution#Version_1 + + %(example)s + + """ + + def _pdf(self, x, beta): + return np.exp(self._logpdf(x, beta)) + + def _logpdf(self, x, beta): + return np.log(.5 * beta) - special.gammaln(1. / beta) - abs(x)**beta + + def _cdf(self, x, beta): + c = .5 * np.sign(x) + # evaluating (.5 + c) first prevents numerical cancellation + return (.5 + c) - c * special.gammaincc(1. / beta, abs(x)**beta) + + def _ppf(self, x, beta): + c = np.sign(x - .5) + # evaluating (1. + c) first prevents numerical cancellation + return c * special.gammainccinv(1. / beta, (1. + c) - 2.*c*x)**(1. / beta) + + def _sf(self, x, beta): + return self._cdf(-x, beta) + + def _isf(self, x, beta): + return -self._ppf(x, beta) + + def _stats(self, beta): + c1, c3, c5 = special.gammaln([1./beta, 3./beta, 5./beta]) + return 0., np.exp(c3 - c1), 0., np.exp(c5 + c1 - 2. * c3) - 3. + + def _entropy(self, beta): + return 1. / beta - np.log(.5 * beta) + special.gammaln(1. / beta) +gennorm = gennorm_gen(name='gennorm') + + +class halfgennorm_gen(rv_continuous): + """The upper half of a generalized normal continuous random variable. 
+ + %(before_notes)s + + Notes + ----- + The probability density function for `halfgennorm` is:: + + beta + halfgennorm.pdf(x, beta) = ------------- exp(-|x|**beta) + gamma(1/beta) + + `halfgennorm` takes ``beta`` as a shape parameter. + For ``beta = 1``, it is identical to an exponential distribution. + For ``beta = 2``, it is identical to a half normal distribution + (with ``scale=1/sqrt(2)``). + + See Also + -------- + gennorm : generalized normal distribution + expon : exponential distribution + halfnorm : half normal distribution + + References + ---------- + + .. [1] "Generalized normal distribution, Version 1", + https://en.wikipedia.org/wiki/Generalized_normal_distribution#Version_1 + + %(example)s + + """ + + def _pdf(self, x, beta): + return np.exp(self._logpdf(x, beta)) + + def _logpdf(self, x, beta): + return np.log(beta) - special.gammaln(1. / beta) - x**beta + + def _cdf(self, x, beta): + return special.gammainc(1. / beta, x**beta) + + def _ppf(self, x, beta): + return special.gammaincinv(1. / beta, x)**(1. / beta) + + def _sf(self, x, beta): + return special.gammaincc(1. / beta, x**beta) + + def _isf(self, x, beta): + return special.gammainccinv(1. / beta, x)**(1. / beta) + + def _entropy(self, beta): + return 1. / beta - np.log(beta) + special.gammaln(1. / beta) +halfgennorm = halfgennorm_gen(a=0, name='halfgennorm') # Collect names of classes and objects in this module. diff --git a/wafo/stats/_discrete_distns.py b/wafo/stats/_discrete_distns.py index c442032..afa18a1 100644 --- a/wafo/stats/_discrete_distns.py +++ b/wafo/stats/_discrete_distns.py @@ -5,15 +5,15 @@ from __future__ import division, print_function, absolute_import from scipy import special -from scipy.special import gammaln as gamln +from scipy.special import entr, gammaln as gamln +from scipy.misc import logsumexp from numpy import floor, ceil, log, exp, sqrt, log1p, expm1, tanh, cosh, sinh import numpy as np -import numpy.random as mtrand -from ._distn_infrastructure import (rv_discrete, _lazywhere, _ncx2_pdf, - _ncx2_cdf, get_distribution_names) +from ._distn_infrastructure import ( + rv_discrete, _lazywhere, _ncx2_pdf, _ncx2_cdf, get_distribution_names) class binom_gen(rv_discrete): @@ -31,11 +31,13 @@ class binom_gen(rv_discrete): `binom` takes ``n`` and ``p`` as shape parameters. + %(after_notes)s + %(example)s """ def _rvs(self, n, p): - return mtrand.binomial(n, p, self._size) + return self._random_state.binomial(n, p, self._size) def _argcheck(self, n, p): self.b = n @@ -78,8 +80,7 @@ class binom_gen(rv_discrete): def _entropy(self, n, p): k = np.r_[0:n + 1] vals = self._pmf(k, n, p) - h = -np.sum(special.xlogy(vals, vals), axis=0) - return h + return np.sum(entr(vals), axis=0) binom = binom_gen(name='binom') @@ -99,6 +100,8 @@ class bernoulli_gen(binom_gen): `bernoulli` takes ``p`` as shape parameter. + %(after_notes)s + %(example)s """ @@ -127,8 +130,7 @@ class bernoulli_gen(binom_gen): return binom._stats(1, p) def _entropy(self, p): - h = -special.xlogy(p, p) - special.xlogy(1 - p, 1 - p) - return h + return entr(p) + entr(1-p) bernoulli = bernoulli_gen(b=1, name='bernoulli') @@ -147,21 +149,23 @@ class nbinom_gen(rv_discrete): `nbinom` takes ``n`` and ``p`` as shape parameters.
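Cross-checks for two changes in this patch (illustrative asides): the gennorm/halfgennorm special cases documented earlier (the scipy.stats names used here require scipy >= 0.17; the classes added above provide the same behaviour), and the `entr`-based entropy rewrites, which work because ``entr(p) == -p*log(p)`` elementwise with the correct limit ``entr(0) == 0``:

>>> import numpy as np
>>> from scipy.stats import gennorm, halfgennorm, laplace, norm
>>> x = np.linspace(-3, 3, 13)
>>> np.allclose(gennorm.pdf(x, 1), laplace.pdf(x))  # beta = 1
True
>>> np.allclose(gennorm.pdf(x, 2), norm.pdf(x, scale=1/np.sqrt(2)))  # beta = 2
True
>>> np.allclose(halfgennorm.pdf(x[x > 0], 1.5), 2 * gennorm.pdf(x[x > 0], 1.5))
True
>>> from scipy.special import entr, xlogy
>>> p = np.array([0.0, 0.25, 0.5, 1.0])
>>> np.allclose(entr(p), -xlogy(p, p))
True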
+ %(after_notes)s + %(example)s """ def _rvs(self, n, p): - return mtrand.negative_binomial(n, p, self._size) + return self._random_state.negative_binomial(n, p, self._size) def _argcheck(self, n, p): - return (n >= 0) & (p >= 0) & (p <= 1) + return (n > 0) & (p >= 0) & (p <= 1) def _pmf(self, x, n, p): return exp(self._logpmf(x, n, p)) def _logpmf(self, x, n, p): - coeff = gamln(n + x) - gamln(x + 1) - gamln(n) - return coeff + special.xlogy(n, p) + special.xlog1py(x, -p) + coeff = gamln(n+x) - gamln(x+1) - gamln(n) + return coeff + n*log(p) + special.xlog1py(x, -p) def _cdf(self, x, n, p): k = floor(x) @@ -204,11 +208,13 @@ class geom_gen(rv_discrete): `geom` takes ``p`` as shape parameter. + %(after_notes)s + %(example)s """ def _rvs(self, p): - return mtrand.geometric(p, size=self._size) + return self._random_state.geometric(p, size=self._size) def _argcheck(self, p): return (p <= 1) & (p >= 0) @@ -221,14 +227,14 @@ class geom_gen(rv_discrete): def _cdf(self, x, p): k = floor(x) - return -expm1(log1p(-p) * k) + return -expm1(log1p(-p)*k) def _sf(self, x, p): return np.exp(self._logsf(x, p)) def _logsf(self, x, p): k = floor(x) - return k * log1p(-p) + return k*log1p(-p) def _ppf(self, q, p): vals = ceil(log1p(-q) / log1p(-p)) @@ -262,6 +268,8 @@ class hypergeom_gen(rv_discrete): pmf(k, M, n, N) = choose(n, k) * choose(M - n, N - k) / choose(M, N), for max(0, N - (M-n)) <= k <= min(n, N) + %(after_notes)s + Examples -------- >>> from scipy.stats import hypergeom @@ -297,10 +305,10 @@ class hypergeom_gen(rv_discrete): """ def _rvs(self, M, n, N): - return mtrand.hypergeometric(n, M-n, N, size=self._size) + return self._random_state.hypergeometric(n, M-n, N, size=self._size) def _argcheck(self, M, n, N): - cond = rv_discrete._argcheck(self, M, n, N) + cond = (M > 0) & (n >= 0) & (N >= 0) cond &= (n <= M) & (N <= M) self.a = max(N-(M-n), 0) self.b = min(n, N) @@ -338,8 +346,7 @@ class hypergeom_gen(rv_discrete): def _entropy(self, M, n, N): k = np.r_[N - (M - n):min(n, N) + 1] vals = self.pmf(k, M, n, N) - h = -np.sum(special.xlogy(vals, vals), axis=0) - return h + return np.sum(entr(vals), axis=0) def _sf(self, k, M, n, N): """More precise calculation, 1 - cdf doesn't cut it.""" @@ -354,6 +361,17 @@ class hypergeom_gen(rv_discrete): k2 = np.arange(quant + 1, draw + 1) res.append(np.sum(self._pmf(k2, tot, good, draw))) return np.asarray(res) + + def _logsf(self, k, M, n, N): + """ + More precise calculation than log(sf) + """ + res = [] + for quant, tot, good, draw in zip(k, M, n, N): + # Integration over probability mass function using logsumexp + k2 = np.arange(quant + 1, draw + 1) + res.append(logsumexp(self._logpmf(k2, tot, good, draw))) + return np.asarray(res) hypergeom = hypergeom_gen(name='hypergeom') @@ -373,13 +391,15 @@ class logser_gen(rv_discrete): `logser` takes ``p`` as shape parameter. + %(after_notes)s + %(example)s """ def _rvs(self, p): # looks wrong for p>0.5, too few k=1 # trying to use generic is worse, no k=1 at all - return mtrand.logseries(p, size=self._size) + return self._random_state.logseries(p, size=self._size) def _argcheck(self, p): return (p > 0) & (p < 1) @@ -419,14 +439,21 @@ class poisson_gen(rv_discrete): `poisson` takes ``mu`` as shape parameter. + %(after_notes)s + %(example)s """ + + # Override rv_discrete._argcheck to allow mu=0. 
+ def _argcheck(self, mu): + return mu >= 0 + def _rvs(self, mu): - return mtrand.poisson(mu, self._size) + return self._random_state.poisson(mu, self._size) def _logpmf(self, k, mu): - Pk = k*log(mu)-gamln(k+1) - mu + Pk = special.xlogy(k, mu) - gamln(k + 1) - mu return Pk def _pmf(self, k, mu): @@ -449,9 +476,11 @@ class poisson_gen(rv_discrete): def _stats(self, mu): var = mu tmp = np.asarray(mu) - g1 = sqrt(1.0 / tmp) - g2 = 1.0 / tmp + mu_nonzero = tmp > 0 + g1 = _lazywhere(mu_nonzero, (tmp,), lambda x: sqrt(1.0/x), np.inf) + g2 = _lazywhere(mu_nonzero, (tmp,), lambda x: 1.0/x, np.inf) return mu, var, g1, g2 + poisson = poisson_gen(name="poisson", longname='A Poisson') @@ -470,6 +499,8 @@ class planck_gen(rv_discrete): `planck` takes ``lambda_`` as shape parameter. + %(after_notes)s + %(example)s """ @@ -487,7 +518,7 @@ class planck_gen(rv_discrete): def _pmf(self, k, lambda_): fact = -expm1(-lambda_) - return fact * exp(-lambda_ * k) + return fact*exp(-lambda_*k) def _cdf(self, x, lambda_): k = floor(x) @@ -528,12 +559,14 @@ class boltzmann_gen(rv_discrete): `boltzmann` takes ``lambda_`` and ``N`` as shape parameters. + %(after_notes)s + %(example)s """ def _pmf(self, k, lambda_, N): fact = (expm1(-lambda_)) / (expm1(-lambda_ * N)) - return fact * exp(-lambda_ * k) + return fact*exp(-lambda_*k) def _cdf(self, x, lambda_, N): k = floor(x) @@ -559,7 +592,7 @@ class boltzmann_gen(rv_discrete): g2 = g2 / trm2 / trm2 return mu, var, g1, g2 boltzmann = boltzmann_gen(name='boltzmann', - longname='A truncated discrete exponential ') + longname='A truncated discrete exponential ') class randint_gen(rv_discrete): @@ -577,8 +610,7 @@ class randint_gen(rv_discrete): `randint` takes ``low`` and ``high`` as shape parameters. - Note the difference to the numpy ``random_integers`` which - returns integers on a *closed* interval ``[low, high]``. + %(after_notes)s %(example)s @@ -616,7 +648,7 @@ class randint_gen(rv_discrete): If ``high`` is ``None``, then range is >=0 and < low """ - return mtrand.randint(low, high, self._size) + return self._random_state.randint(low, high, self._size) def _entropy(self, low, high): return log(high - low) @@ -624,21 +656,6 @@ randint = randint_gen(name='randint', longname='A discrete uniform ' '(random integer)') -def harmonic(n, r): - return (1./n + special.polygamma(r-1, n)/special.gamma(r) + - special.zeta(r, 1)) - - -def H(n): - """Returns the n-th harmonic number. - - http://en.wikipedia.org/wiki/Harmonic_number - """ - # Euler-Mascheroni constant - gamma = 0.57721566490153286060651209008240243104215933593992 - return gamma + special.digamma(n+1) - - # FIXME: problems sampling. class zipf_gen(rv_discrete): """A Zipf discrete random variable. @@ -655,11 +672,13 @@ class zipf_gen(rv_discrete): `zipf` takes ``a`` as shape parameter. + %(after_notes)s + %(example)s """ def _rvs(self, a): - return mtrand.zipf(a, size=self._size) + return self._random_state.zipf(a, size=self._size) def _argcheck(self, a): return a > 1 @@ -691,6 +710,8 @@ class dlaplace_gen(rv_discrete): `dlaplace` takes ``a`` as shape parameter. 
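With the relaxed ``_argcheck`` above, ``mu == 0`` is now a valid (degenerate) Poisson with all mass at ``k = 0``, and the `_lazywhere` guard in ``_stats`` keeps the skew and kurtosis formulas from dividing by zero. An illustrative session against the patched wafo.stats:

>>> from wafo.stats import poisson
>>> poisson.pmf(0, mu=0) == 1.0  # all mass at k = 0
True
>>> m, v = poisson.stats(mu=0, moments='mv')
>>> float(m), float(v)
(0.0, 0.0)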
+ %(after_notes)s + %(example)s """ @@ -705,9 +726,8 @@ class dlaplace_gen(rv_discrete): def _ppf(self, q, a): const = 1 + exp(a) - vals = ceil(np.where(q < 1.0 / (1 + exp(-a)), - log(q*const) / a - 1, - -log((1-q) * const) / a)) + vals = ceil(np.where(q < 1.0 / (1 + exp(-a)), log(q*const) / a - 1, + -log((1-q) * const) / a)) vals1 = vals - 1 return np.where(self._cdf(vals1, a) >= q, vals1, vals) @@ -746,25 +766,28 @@ class skellam_gen(rv_discrete): `skellam` takes ``mu1`` and ``mu2`` as shape parameters. + %(after_notes)s + %(example)s """ def _rvs(self, mu1, mu2): n = self._size - return mtrand.poisson(mu1, n) - mtrand.poisson(mu2, n) + return (self._random_state.poisson(mu1, n) - + self._random_state.poisson(mu2, n)) def _pmf(self, x, mu1, mu2): px = np.where(x < 0, - _ncx2_pdf(2*mu2, 2*(1-x), 2*mu1)*2, - _ncx2_pdf(2*mu1, 2*(1+x), 2*mu2)*2) + _ncx2_pdf(2*mu2, 2*(1-x), 2*mu1)*2, + _ncx2_pdf(2*mu1, 2*(1+x), 2*mu2)*2) # ncx2.pdf() returns nan's for extremely low probabilities return px def _cdf(self, x, mu1, mu2): x = floor(x) px = np.where(x < 0, - _ncx2_cdf(2*mu2, -2*x, 2*mu1), - 1-_ncx2_cdf(2*mu1, 2*(x+1), 2*mu2)) + _ncx2_cdf(2*mu2, -2*x, 2*mu1), + 1-_ncx2_cdf(2*mu1, 2*(x+1), 2*mu2)) return px def _stats(self, mu1, mu2): diff --git a/wafo/stats/_distn_infrastructure.py b/wafo/stats/_distn_infrastructure.py index 3fb23b0..9677194 100644 --- a/wafo/stats/_distn_infrastructure.py +++ b/wafo/stats/_distn_infrastructure.py @@ -1,193 +1,10 @@ -# -# Author: Travis Oliphant 2002-2011 with contributions from -# SciPy Developers 2004-2011 -# -from __future__ import division, print_function, absolute_import - -from scipy._lib.six import string_types, exec_ - -import sys -import keyword -import re -import inspect -import types -import warnings - -from scipy.misc import doccer -from ._distr_params import distcont, distdiscrete - -from scipy.special import xlogy, chndtr, gammaln, hyp0f1, comb - -# for root finding for discrete distribution ppf, and max likelihood estimation -from scipy import optimize - -# for functions of continuous distributions (e.g. moments, entropy, cdf) -from scipy import integrate - -# to approximate the pdf of a continuous distribution given its cdf -from scipy.misc import derivative - -from numpy import (arange, putmask, ravel, take, ones, sum, shape, - product, reshape, zeros, floor, logical_and, log, sqrt, exp, - ndarray) - -from numpy import (place, any, argsort, argmax, vectorize, - asarray, nan, inf, isinf, NINF, empty) - +from scipy.stats._distn_infrastructure import * +from scipy.stats._distn_infrastructure import (_skew, _kurtosis, # @UnresolvedImport + _lazywhere, _ncx2_log_pdf, _ncx2_pdf, _ncx2_cdf) +from wafo.stats.estimation import FitDistribution +from wafo.stats._constants import _EPS, _XMAX import numpy as np -import numpy.random as mtrand -from ._constants import _EPS, _XMAX -from .estimation import FitDistribution - -try: - from new import instancemethod -except ImportError: - # Python 3 - def instancemethod(func, obj, cls): - return types.MethodType(func, obj) - - -# These are the docstring parts used for substitution in specific -# distribution docstrings - -docheaders = {'methods': """\nMethods\n-------\n""", - 'parameters': """\nParameters\n---------\n""", - 'notes': """\nNotes\n-----\n""", - 'examples': """\nExamples\n--------\n"""} - -_doc_rvs = """\ -``rvs(%(shapes)s, loc=0, scale=1, size=1)`` - Random variates. -""" -_doc_pdf = """\ -``pdf(x, %(shapes)s, loc=0, scale=1)`` - Probability density function. 
-""" -_doc_logpdf = """\ -``logpdf(x, %(shapes)s, loc=0, scale=1)`` - Log of the probability density function. -""" -_doc_pmf = """\ -``pmf(x, %(shapes)s, loc=0, scale=1)`` - Probability mass function. -""" -_doc_logpmf = """\ -``logpmf(x, %(shapes)s, loc=0, scale=1)`` - Log of the probability mass function. -""" -_doc_cdf = """\ -``cdf(x, %(shapes)s, loc=0, scale=1)`` - Cumulative density function. -""" -_doc_logcdf = """\ -``logcdf(x, %(shapes)s, loc=0, scale=1)`` - Log of the cumulative density function. -""" -_doc_sf = """\ -``sf(x, %(shapes)s, loc=0, scale=1)`` - Survival function (1-cdf --- sometimes more accurate). -""" -_doc_logsf = """\ -``logsf(x, %(shapes)s, loc=0, scale=1)`` - Log of the survival function. -""" -_doc_ppf = """\ -``ppf(q, %(shapes)s, loc=0, scale=1)`` - Percent point function (inverse of cdf --- percentiles). -""" -_doc_isf = """\ -``isf(q, %(shapes)s, loc=0, scale=1)`` - Inverse survival function (inverse of sf). -""" -_doc_moment = """\ -``moment(n, %(shapes)s, loc=0, scale=1)`` - Non-central moment of order n -""" -_doc_stats = """\ -``stats(%(shapes)s, loc=0, scale=1, moments='mv')`` - Mean('m'), variance('v'), skew('s'), and/or kurtosis('k'). -""" -_doc_entropy = """\ -``entropy(%(shapes)s, loc=0, scale=1)`` - (Differential) entropy of the RV. -""" -_doc_fit = """\ -``fit(data, %(shapes)s, loc=0, scale=1)`` - Parameter estimates for generic data. -""" -_doc_expect = """\ -``expect(func, %(shapes)s, loc=0, scale=1, lb=None, ub=None, conditional=False, **kwds)`` - Expected value of a function (of one argument) with respect to the distribution. -""" -_doc_expect_discrete = """\ -``expect(func, %(shapes)s, loc=0, lb=None, ub=None, conditional=False)`` - Expected value of a function (of one argument) with respect to the distribution. -""" -_doc_median = """\ -``median(%(shapes)s, loc=0, scale=1)`` - Median of the distribution. -""" -_doc_mean = """\ -``mean(%(shapes)s, loc=0, scale=1)`` - Mean of the distribution. -""" -_doc_var = """\ -``var(%(shapes)s, loc=0, scale=1)`` - Variance of the distribution. -""" -_doc_std = """\ -``std(%(shapes)s, loc=0, scale=1)`` - Standard deviation of the distribution. -""" -_doc_interval = """\ -``interval(alpha, %(shapes)s, loc=0, scale=1)`` - Endpoints of the range that contains alpha percent of the distribution -""" -_doc_allmethods = ''.join([docheaders['methods'], _doc_rvs, _doc_pdf, - _doc_logpdf, _doc_cdf, _doc_logcdf, _doc_sf, - _doc_logsf, _doc_ppf, _doc_isf, _doc_moment, - _doc_stats, _doc_entropy, _doc_fit, - _doc_expect, _doc_median, - _doc_mean, _doc_var, _doc_std, _doc_interval]) - -# Note that the two lines for %(shapes) are searched for and replaced in -# rv_continuous and rv_discrete - update there if the exact string changes -_doc_default_callparams = """ -Parameters ----------- -x : array_like - quantiles -q : array_like - lower or upper tail probability -%(shapes)s : array_like - shape parameters -loc : array_like, optional - location parameter (default=0) -scale : array_like, optional - scale parameter (default=1) -size : int or tuple of ints, optional - shape of random variates (default computed from input arguments ) -moments : str, optional - composed of letters ['mvsk'] specifying which moments to compute where - 'm' = mean, 'v' = variance, 's' = (Fisher's) skew and - 'k' = (Fisher's) kurtosis. - Default is 'mv'. -""" -_doc_default_longsummary = """\ -Continuous random variables are defined from a standard form and may -require some shape parameters to complete its specification. 
Any -optional keyword parameters can be passed to the methods of the RV -object as given below: -""" -_doc_default_frozen_note = """ -Alternatively, the object may be called (as a function) to fix the shape, -location, and scale parameters returning a "frozen" continuous RV object: - -rv = %(name)s(%(shapes)s, loc=0, scale=1) - - Frozen RV object with the same methods but holding the given shape, - location, and scale fixed. -""" _doc_default_example = """\ Examples -------- @@ -203,11 +20,15 @@ Calculate a few first moments: Display the probability density function (``pdf``): >>> x = np.linspace(%(name)s.ppf(0.01, %(shapes)s), -... %(name)s.ppf(0.99, %(shapes)s), 100) +... %(name)s.ppf(0.99, %(shapes)s), 100) >>> ax.plot(x, %(name)s.pdf(x, %(shapes)s), -... 'r-', lw=5, alpha=0.6, label='%(name)s pdf') +... 'r-', lw=5, alpha=0.6, label='%(name)s pdf') -Alternatively, freeze the distribution and display the frozen pdf: +Alternatively, the distribution object can be called (as a function) +to fix the shape, location and scale parameters. This returns a "frozen" +RV object holding the given parameters fixed. + +Freeze the distribution and display the frozen ``pdf``: >>> rv = %(name)s(%(shapes)s) >>> ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf') @@ -229,8 +50,8 @@ And compare the histogram: >>> plt.show() Compare ML and MPS method ->>> phat = %(name)s.fit2(R, method='ml'); ->>> phat.plotfitsummary(); plt.figure(plt.gcf().number+1) +>>> phat = %(name)s.fit2(R, method='ml') +>>> phat.plotfitsummary() +>>> plt.figure(plt.gcf().number+1) >>> phat2 = %(name)s.fit2(R, method='mps') >>> phat2.plotfitsummary(); plt.figure(plt.gcf().number+1) @@ -245,275 +66,6 @@ Accurate confidence interval with profile loglikelihood """ -_doc_default = ''.join([_doc_default_longsummary, - _doc_allmethods, - _doc_default_callparams, - _doc_default_frozen_note, - _doc_default_example]) - -_doc_default_before_notes = ''.join([_doc_default_longsummary, - _doc_allmethods, - _doc_default_callparams, - _doc_default_frozen_note]) - docdict = { - 'rvs': _doc_rvs, - 'pdf': _doc_pdf, - 'logpdf': _doc_logpdf, - 'cdf': _doc_cdf, - 'logcdf': _doc_logcdf, - 'sf': _doc_sf, - 'logsf': _doc_logsf, - 'ppf': _doc_ppf, - 'isf': _doc_isf, - 'stats': _doc_stats, - 'entropy': _doc_entropy, - 'fit': _doc_fit, - 'moment': _doc_moment, - 'expect': _doc_expect, - 'interval': _doc_interval, - 'mean': _doc_mean, - 'std': _doc_std, - 'var': _doc_var, - 'median': _doc_median, - 'allmethods': _doc_allmethods, - 'callparams': _doc_default_callparams, - 'longsummary': _doc_default_longsummary, - 'frozennote': _doc_default_frozen_note, - 'example': _doc_default_example, - 'default': _doc_default, - 'before_notes': _doc_default_before_notes -} - -# Reuse common content between continuous and discrete docs, change some -minor bits.
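A hedged sketch of the wafo-specific ``fit2`` workflow exercised by the template above (``fit2`` returns a FitDistribution from wafo.stats.estimation, per this commit; the distribution, sample size and method here are illustrative choices, not prescribed by the patch):

>>> import matplotlib.pyplot as plt
>>> from wafo.stats import genextreme
>>> R = genextreme.rvs(0.2, size=200)          # synthetic sample
>>> phat = genextreme.fit2(R, method='mps')    # maximum product of spacings
>>> phat.plotfitsummary()
>>> plt.show()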
-docdict_discrete = docdict.copy() - -docdict_discrete['pmf'] = _doc_pmf -docdict_discrete['logpmf'] = _doc_logpmf -docdict_discrete['expect'] = _doc_expect_discrete -_doc_disc_methods = ['rvs', 'pmf', 'logpmf', 'cdf', 'logcdf', 'sf', 'logsf', - 'ppf', 'isf', 'stats', 'entropy', 'expect', 'median', - 'mean', 'var', 'std', 'interval', - 'fit'] -for obj in _doc_disc_methods: - docdict_discrete[obj] = docdict_discrete[obj].replace(', scale=1', '') -docdict_discrete.pop('pdf') -docdict_discrete.pop('logpdf') - -_doc_allmethods = ''.join([docdict_discrete[obj] for obj in _doc_disc_methods]) -docdict_discrete['allmethods'] = docheaders['methods'] + _doc_allmethods - -docdict_discrete['longsummary'] = _doc_default_longsummary.replace( - 'Continuous', 'Discrete') -_doc_default_frozen_note = """ -Alternatively, the object may be called (as a function) to fix the shape and -location parameters returning a "frozen" discrete RV object: - -rv = %(name)s(%(shapes)s, loc=0) - - Frozen RV object with the same methods but holding the given shape and - location fixed. -""" -docdict_discrete['frozennote'] = _doc_default_frozen_note - -_doc_default_discrete_example = """\ -Examples --------- ->>> from wafo.stats import %(name)s ->>> import matplotlib.pyplot as plt ->>> fig, ax = plt.subplots(1, 1) - -Calculate a few first moments: - -%(set_vals_stmt)s ->>> mean, var, skew, kurt = %(name)s.stats(%(shapes)s, moments='mvsk') - -Display the probability mass function (``pmf``): - ->>> x = np.arange(%(name)s.ppf(0.01, %(shapes)s), -... %(name)s.ppf(0.99, %(shapes)s)) ->>> ax.plot(x, %(name)s.pmf(x, %(shapes)s), 'bo', ms=8, label='%(name)s pmf') ->>> ax.vlines(x, 0, %(name)s.pmf(x, %(shapes)s), colors='b', lw=5, alpha=0.5) - -Alternatively, freeze the distribution and display the frozen ``pmf``: - ->>> rv = %(name)s(%(shapes)s) ->>> ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-', lw=1, -... 
label='frozen pmf') ->>> ax.legend(loc='best', frameon=False) ->>> plt.show() - -Check accuracy of ``cdf`` and ``ppf``: - ->>> prob = %(name)s.cdf(x, %(shapes)s) ->>> np.allclose(x, %(name)s.ppf(prob, %(shapes)s)) -True - -Generate random numbers: - ->>> r = %(name)s.rvs(%(shapes)s, size=1000) -""" -docdict_discrete['example'] = _doc_default_discrete_example - -_doc_default_before_notes = ''.join([docdict_discrete['longsummary'], - docdict_discrete['allmethods'], - docdict_discrete['callparams'], - docdict_discrete['frozennote']]) -docdict_discrete['before_notes'] = _doc_default_before_notes - -_doc_default_disc = ''.join([docdict_discrete['longsummary'], - docdict_discrete['allmethods'], - docdict_discrete['frozennote'], - docdict_discrete['example']]) -docdict_discrete['default'] = _doc_default_disc - - -# clean up all the separate docstring elements, we do not need them anymore -for obj in [s for s in dir() if s.startswith('_doc_')]: - exec('del ' + obj) -del obj -try: - del s -except NameError: - # in Python 3, loop variables are not visible after the loop - pass - - -def _moment(data, n, mu=None): - if mu is None: - mu = data.mean() - return ((data - mu)**n).mean() - - -def _moment_from_stats(n, mu, mu2, g1, g2, moment_func, args): - if (n == 0): - return 1.0 - elif (n == 1): - if mu is None: - val = moment_func(1, *args) - else: - val = mu - elif (n == 2): - if mu2 is None or mu is None: - val = moment_func(2, *args) - else: - val = mu2 + mu*mu - elif (n == 3): - if g1 is None or mu2 is None or mu is None: - val = moment_func(3, *args) - else: - mu3 = g1 * np.power(mu2, 1.5) # 3rd central moment - val = mu3+3*mu*mu2+mu*mu*mu # 3rd non-central moment - elif (n == 4): - if g1 is None or g2 is None or mu2 is None or mu is None: - val = moment_func(4, *args) - else: - mu4 = (g2+3.0)*(mu2**2.0) # 4th central moment - mu3 = g1*np.power(mu2, 1.5) # 3rd central moment - val = mu4+4*mu*mu3+6*mu*mu*mu2+mu*mu*mu*mu - else: - val = moment_func(n, *args) - - return val - - -def _skew(data): - """ - skew is third central moment / variance**(1.5) - """ - data = np.ravel(data) - mu = data.mean() - m2 = ((data - mu)**2).mean() - m3 = ((data - mu)**3).mean() - return m3 / np.power(m2, 1.5) - - -def _kurtosis(data): - """ - kurtosis is fourth central moment / variance**2 - 3 - """ - data = np.ravel(data) - mu = data.mean() - m2 = ((data - mu)**2).mean() - m4 = ((data - mu)**4).mean() - return m4 / m2**2 - 3 - - -# Frozen RV class -class rv_frozen_old(object): - - def __init__(self, dist, *args, **kwds): - self.args = args - self.kwds = kwds - - # create a new instance - self.dist = dist.__class__(**dist._ctor_param) - - # a, b may be set in _argcheck, depending on *args, **kwds. Ouch. 
- shapes, _, _ = self.dist._parse_args(*args, **kwds) - self.dist._argcheck(*shapes) - - def pdf(self, x): # raises AttributeError in frozen discrete distribution - return self.dist.pdf(x, *self.args, **self.kwds) - - def logpdf(self, x): - return self.dist.logpdf(x, *self.args, **self.kwds) - - def cdf(self, x): - return self.dist.cdf(x, *self.args, **self.kwds) - - def logcdf(self, x): - return self.dist.logcdf(x, *self.args, **self.kwds) - - def ppf(self, q): - return self.dist.ppf(q, *self.args, **self.kwds) - - def isf(self, q): - return self.dist.isf(q, *self.args, **self.kwds) - - def rvs(self, size=None): - kwds = self.kwds.copy() - kwds.update({'size': size}) - return self.dist.rvs(*self.args, **kwds) - - def sf(self, x): - return self.dist.sf(x, *self.args, **self.kwds) - - def logsf(self, x): - return self.dist.logsf(x, *self.args, **self.kwds) - - def stats(self, moments='mv'): - kwds = self.kwds.copy() - kwds.update({'moments': moments}) - return self.dist.stats(*self.args, **kwds) - - def median(self): - return self.dist.median(*self.args, **self.kwds) - - def mean(self): - return self.dist.mean(*self.args, **self.kwds) - - def var(self): - return self.dist.var(*self.args, **self.kwds) - - def std(self): - return self.dist.std(*self.args, **self.kwds) - - def moment(self, n): - return self.dist.moment(n, *self.args, **self.kwds) - - def entropy(self): - return self.dist.entropy(*self.args, **self.kwds) - - def pmf(self, k): - return self.dist.pmf(k, *self.args, **self.kwds) - - def logpmf(self, k): - return self.dist.logpmf(k, *self.args, **self.kwds) - - def interval(self, alpha): - return self.dist.interval(alpha, *self.args, **self.kwds) - # Frozen RV class class rv_frozen(object): @@ -521,51 +73,52 @@ class rv_frozen(object): Methods ------- - RV.rvs(size=1) - - random variates - - RV.pdf(x) - - probability density function (continous case) - - RV.pmf(x) - - probability mass function (discrete case) - - RV.cdf(x) - - cumulative density function - - RV.sf(x) - - survival function (1-cdf --- sometimes more accurate) - - RV.ppf(q) - - percent point function (inverse of cdf --- percentiles) - - RV.isf(q) - - inverse survival function (inverse of sf) - - RV.stats(moments='mv') - - mean('m'), variance('v'), skew('s'), and/or kurtosis('k') - - RV.entropy() - - (differential) entropy of the RV. - - Parameters - ---------- - x : array-like - quantiles - q : array-like - lower or upper tail probability - size : int or tuple of ints, optional, keyword - shape of random variates - moments : string, optional, keyword - one or more of 'm' mean, 'v' variance, 's' skewness, 'k' kurtosis + rvs(size=1) + Random variates. + pdf(x) + Probability density function. + cdf(x) + Cumulative density function. + sf(x) + Survival function (1-cdf --- sometimes more accurate). + ppf(q) + Percent point function (inverse of cdf --- percentiles). + isf(q) + Inverse survival function (inverse of sf). + stats(moments='mv') + Mean('m'), variance('v'), skew('s'), and/or kurtosis('k'). + moment(n) + n-th order non-central moment of distribution. + entropy() + (Differential) entropy of the RV. + interval(alpha) + Confidence interval with equal areas around the median. + expect(func, lb, ub, conditional=False) + Calculate expected value of a function with respect to the + distribution. 
''' def __init__(self, dist, *args, **kwds): - self.dist = dist - args, loc, scale = dist._parse_args(*args, **kwds) + # create a new instance + self.dist = dist # .__class__(**dist._ctor_param) + shapes, loc, scale = self.dist._parse_args(*args, **kwds) if isinstance(dist, rv_continuous): - self.par = args + (loc, scale) + self.par = shapes + (loc, scale) else: # rv_discrete - self.par = args + (loc,) + self.par = shapes + (loc,) + self.a = self.dist.a + self.b = self.dist.b + self.shapes = self.dist.shapes + # @property + # def shapes(self): + # return self.dist.shapes + + @property + def random_state(self): + return self.dist._random_state + + @random_state.setter + def random_state(self, seed): + self.dist._random_state = check_random_state(seed) def pdf(self, x): ''' Probability density function at x of the given RV.''' @@ -589,9 +142,8 @@ class rv_frozen(object): '''Inverse survival function at q of the given RV.''' return self.dist.isf(q, *self.par) - def rvs(self, size=None): - '''Random variates of given type.''' - kwds = dict(size=size) + def rvs(self, size=None, random_state=None): + kwds = {'size': size, 'random_state': random_state} return self.dist.rvs(*self.par, **kwds) def sf(self, x): @@ -634,2998 +186,391 @@ class rv_frozen(object): def interval(self, alpha): return self.dist.interval(alpha, *self.par) + def expect(self, func=None, lb=None, ub=None, conditional=False, **kwds): + if isinstance(self.dist, rv_continuous): + a, loc, scale = self.par[:-2], self.par[-2], self.par[-1] + return self.dist.expect(func, a, loc, scale, lb, ub, conditional, + **kwds) + a, loc = self.par[:-1], self.par[-1] + if kwds: + raise ValueError("Discrete expect does not accept **kwds.") + return self.dist.expect(func, a, loc, lb, ub, conditional) -def valarray(shape, value=nan, typecode=None): - """Return an array of all value. - """ - - out = ones(shape, dtype=bool) * value - if typecode is not None: - out = out.astype(typecode) - if not isinstance(out, ndarray): - out = asarray(out) - return out - - -def _lazywhere(cond, arrays, f, fillvalue=None, f2=None): - """ - np.where(cond, x, fillvalue) always evaluates x even where cond is False. - This one only evaluates f(arr1[cond], arr2[cond], ...). - For example, - >>> a, b = np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]) - >>> def f(a, b): - return a*b - >>> _lazywhere(a > 2, (a, b), f, np.nan) - array([ nan, nan, 21., 32.]) - - Notice it assumes that all `arrays` are of the same shape, or can be - broadcasted together. - - """ - if fillvalue is None: - if f2 is None: - raise ValueError("One of (fillvalue, f2) must be given.") - else: - fillvalue = np.nan - else: - if f2 is not None: - raise ValueError("Only one of (fillvalue, f2) can be given.") - - arrays = np.broadcast_arrays(*arrays) - temp = tuple(np.extract(cond, arr) for arr in arrays) - out = valarray(shape(arrays[0]), value=fillvalue) - np.place(out, cond, f(*temp)) - if f2 is not None: - temp = tuple(np.extract(~cond, arr) for arr in arrays) - np.place(out, ~cond, f2(*temp)) - - return out - - -# This should be rewritten -def argsreduce(cond, *args): - """Return the sequence of ravel(args[i]) where ravel(condition) is - True in 1D.
- - Examples - -------- - >>> import numpy as np - >>> rand = np.random.random_sample - >>> A = rand((4, 5)) - >>> B = 2 - >>> C = rand((1, 5)) - >>> cond = np.ones(A.shape) - >>> [A1, B1, C1] = argsreduce(cond, A, B, C) - >>> B1.shape - (20,) - >>> cond[2,:] = 0 - >>> [A2, B2, C2] = argsreduce(cond, A, B, C) - >>> B2.shape - (15,) - - """ - newargs = np.atleast_1d(*args) - if not isinstance(newargs, list): - newargs = [newargs, ] - expand_arr = (cond == cond) - return [np.extract(cond, arr1 * expand_arr) for arr1 in newargs] - - -parse_arg_template = """ -def _parse_args(self, %(shape_arg_str)s %(locscale_in)s): - return (%(shape_arg_str)s), %(locscale_out)s - -def _parse_args_rvs(self, %(shape_arg_str)s %(locscale_in)s, size=None): - return (%(shape_arg_str)s), %(locscale_out)s, size - -def _parse_args_stats(self, %(shape_arg_str)s %(locscale_in)s, moments='mv'): - return (%(shape_arg_str)s), %(locscale_out)s, moments -""" - - -# Both the continuous and discrete distributions depend on ncx2. -# I think the function name ncx2 is an abbreviation for noncentral chi squared. -def _ncx2_log_pdf(x, df, nc): - a = asarray(df/2.0) - fac = -nc/2.0 - x/2.0 + (a-1)*log(x) - a*log(2) - gammaln(a) - return fac + np.nan_to_num(log(hyp0f1(a, nc * x/4.0))) - - -def _ncx2_pdf(x, df, nc): - return np.exp(_ncx2_log_pdf(x, df, nc)) - - -def _ncx2_cdf(x, df, nc): - return chndtr(x, df, nc) +def freeze(self, *args, **kwds): + """Freeze the distribution for the given arguments. + Parameters + ---------- + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution. Should include all + the non-optional arguments, may include ``loc`` and ``scale``. -class rv_generic(object): - """Class which encapsulates common functionality between rv_discrete - and rv_continuous. + Returns + ------- + rv_frozen : rv_frozen instance + The frozen distribution. """ - def __init__(self): - super(rv_generic, self).__init__() - - # figure out if _stats signature has 'moments' keyword - sign = inspect.getargspec(self._stats) - self._stats_has_moments = ((sign[2] is not None) or - ('moments' in sign[0])) - - def _construct_argparser( - self, meths_to_inspect, locscale_in, locscale_out): - """Construct the parser for the shape arguments. - - Generates the argument-parsing functions dynamically and attaches - them to the instance. - Is supposed to be called in __init__ of a class for each distribution. - - If self.shapes is a non-empty string, interprets it as a - comma-separated list of shape parameters. - - Otherwise inspects the call signatures of `meths_to_inspect` - and constructs the argument-parsing functions from these. - In this case also sets `shapes` and `numargs`. 
- """ - - if self.shapes: - # sanitize the user-supplied shapes - if not isinstance(self.shapes, string_types): - raise TypeError('shapes must be a string.') - - shapes = self.shapes.replace(',', ' ').split() - - for field in shapes: - if keyword.iskeyword(field): - raise SyntaxError('keywords cannot be used as shapes.') - if not re.match('^[_a-zA-Z][_a-zA-Z0-9]*$', field): - raise SyntaxError( - 'shapes must be valid python identifiers') - else: - # find out the call signatures (_pdf, _cdf etc), deduce shape - # arguments - shapes_list = [] - for meth in meths_to_inspect: - shapes_args = inspect.getargspec(meth) - shapes_list.append(shapes_args.args) - - # *args or **kwargs are not allowed w/automatic shapes - # (generic methods have 'self, x' only) - if len(shapes_args.args) > 2: - if shapes_args.varargs is not None: - raise TypeError( - '*args are not allowed w/out explicit shapes') - if shapes_args.keywords is not None: - raise TypeError( - '**kwds are not allowed w/out explicit shapes') - if shapes_args.defaults is not None: - raise TypeError('defaults are not allowed for shapes') - - shapes = max(shapes_list, key=lambda x: len(x)) - shapes = shapes[2:] # remove self, x, - - # make sure the signatures are consistent - # (generic methods have 'self, x' only) - for item in shapes_list: - if len(item) > 2 and item[2:] != shapes: - raise TypeError('Shape arguments are inconsistent.') - - # have the arguments, construct the method from template - shapes_str = ', '.join(shapes) + ', ' if shapes else '' # NB: not None - dct = dict(shape_arg_str=shapes_str, - locscale_in=locscale_in, - locscale_out=locscale_out, - ) - ns = {} - exec_(parse_arg_template % dct, ns) - # NB: attach to the instance, not class - for name in ['_parse_args', '_parse_args_stats', '_parse_args_rvs']: - setattr(self, name, - instancemethod(ns[name], self, self.__class__) - ) - - self.shapes = ', '.join(shapes) if shapes else None - if not hasattr(self, 'numargs'): - # allows more general subclassing with *args - self.numargs = len(shapes) - - def _construct_doc(self, docdict, shapes_vals=None): - """Construct the instance docstring with string substitutions.""" - tempdict = docdict.copy() - tempdict['name'] = self.name or 'distname' - tempdict['shapes'] = self.shapes or '' - - if shapes_vals is None: - shapes_vals = () - vals = ', '.join(str(_) for _ in shapes_vals) - tempdict['vals'] = vals - - if self.shapes: - tempdict['set_vals_stmt'] = '>>> %s = %s' % (self.shapes, vals) - else: - tempdict['set_vals_stmt'] = '' - - if self.shapes is None: - # remove shapes from call parameters if there are none - for item in ['callparams', 'default', 'before_notes']: - tempdict[item] = tempdict[item].replace( - "\n%(shapes)s : array_like\n shape parameters", "") - for i in range(2): - if self.shapes is None: - # necessary because we use %(shapes)s in two forms (w w/o ", ") - self.__doc__ = self.__doc__.replace("%(shapes)s, ", "") - self.__doc__ = doccer.docformat(self.__doc__, tempdict) - - # correct for empty shapes - self.__doc__ = self.__doc__.replace('(, ', '(').replace(', )', ')') - - def freeze(self, *args, **kwds): - """Freeze the distribution for the given arguments. - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution. Should include all - the non-optional arguments, may include ``loc`` and ``scale``. - - Returns - ------- - rv_frozen : rv_frozen instance - The frozen distribution. 
- - """ - return rv_frozen(self, *args, **kwds) - - def __call__(self, *args, **kwds): - return self.freeze(*args, **kwds) - - # The actual calculation functions (no basic checking need be done) - # If these are defined, the others won't be looked at. - # Otherwise, the other set can be defined. - def _stats(self, *args, **kwds): - return None, None, None, None - - # Central moments - def _munp(self, n, *args): - # Silence floating point warnings from integration. - olderr = np.seterr(all='ignore') - vals = self.generic_moment(n, *args) - np.seterr(**olderr) - return vals - - ## These are the methods you must define (standard form functions) - ## NB: generic _pdf, _logpdf, _cdf are different for - ## rv_continuous and rv_discrete hence are defined in there - def _argcheck(self, *args): - """Default check for correct values on args and keywords. - - Returns condition array of 1's where arguments are correct and - 0's where they are not. - - """ - cond = 1 - for arg in args: - cond = logical_and(cond, (asarray(arg) > 0)) - return cond - - ##(return 1-d using self._size to get number) - def _rvs(self, *args): - ## Use basic inverse cdf algorithm for RV generation as default. - U = mtrand.sample(self._size) - Y = self._ppf(U, *args) - return Y - - def _logcdf(self, x, *args): - return log(self._cdf(x, *args)) - - def _sf(self, x, *args): - return 1.0-self._cdf(x, *args) - - def _logsf(self, x, *args): - return log(self._sf(x, *args)) - - def _ppf(self, q, *args): - return self._ppfvec(q, *args) - - def _isf(self, q, *args): - return self._ppf(1.0-q, *args) # use correct _ppf for subclasses - - # These are actually called, and should not be overwritten if you - # want to keep error checking. - def rvs(self, *args, **kwds): - """ - Random variates of given type. - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - scale : array_like, optional - Scale parameter (default=1). - size : int or tuple of ints, optional - Defining number of random variates (default=1). - - Returns - ------- - rvs : ndarray or scalar - Random variates of given `size`. - - """ - discrete = kwds.pop('discrete', None) - args, loc, scale, size = self._parse_args_rvs(*args, **kwds) - cond = logical_and(self._argcheck(*args), (scale >= 0)) - if not np.all(cond): - raise ValueError("Domain error in arguments.") - - # self._size is total size of all output values - self._size = product(size, axis=0) - if self._size is not None and self._size > 1: - size = np.array(size, ndmin=1) - - if np.all(scale == 0): - return loc*ones(size, 'd') - - vals = self._rvs(*args) - if self._size is not None: - vals = reshape(vals, size) - - vals = vals * scale + loc - - # Cast to int if discrete - if discrete: - if np.isscalar(vals): - vals = int(vals) - else: - vals = vals.astype(int) - - return vals - - def stats(self, *args, **kwds): - """ - Some statistics of the given RV - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional (discrete RVs only) - scale parameter (default=1) - moments : str, optional - composed of letters ['mvsk'] defining which moments to compute: - 'm' = mean, - 'v' = variance, - 's' = (Fisher's) skew, - 'k' = (Fisher's) kurtosis. 
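The four standardized moments of the unit exponential are known in closed form, which makes a quick check of ``moments='mvsk'`` possible (a sketch using the scipy-compatible ``expon``)::

    >>> from scipy import stats
    >>> m, v, s, k = stats.expon.stats(moments='mvsk')
    >>> float(m), float(v), float(s), float(k)
    (1.0, 1.0, 2.0, 6.0)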
- (default='mv') - - Returns - ------- - stats : sequence - of requested moments. - - """ - args, loc, scale, moments = self._parse_args_stats(*args, **kwds) - # scale = 1 by construction for discrete RVs - loc, scale = map(asarray, (loc, scale)) - args = tuple(map(asarray, args)) - cond = self._argcheck(*args) & (scale > 0) & (loc == loc) - output = [] - default = valarray(shape(cond), self.badvalue) - - # Use only entries that are valid in calculation - if any(cond): - goodargs = argsreduce(cond, *(args+(scale, loc))) - scale, loc, goodargs = goodargs[-2], goodargs[-1], goodargs[:-2] - - if self._stats_has_moments: - mu, mu2, g1, g2 = self._stats(*goodargs, - **{'moments': moments}) - else: - mu, mu2, g1, g2 = self._stats(*goodargs) - if g1 is None: - mu3 = None - else: - if mu2 is None: - mu2 = self._munp(2, *goodargs) - # (mu2**1.5) breaks down for nan and inf - mu3 = g1 * np.power(mu2, 1.5) - - if 'm' in moments: - if mu is None: - mu = self._munp(1, *goodargs) - out0 = default.copy() - place(out0, cond, mu * scale + loc) - output.append(out0) - - if 'v' in moments: - if mu2 is None: - mu2p = self._munp(2, *goodargs) - if mu is None: - mu = self._munp(1, *goodargs) - mu2 = mu2p - mu * mu - if np.isinf(mu): - #if mean is inf then var is also inf - mu2 = np.inf - out0 = default.copy() - place(out0, cond, mu2 * scale * scale) - output.append(out0) - - if 's' in moments: - if g1 is None: - mu3p = self._munp(3, *goodargs) - if mu is None: - mu = self._munp(1, *goodargs) - if mu2 is None: - mu2p = self._munp(2, *goodargs) - mu2 = mu2p - mu * mu - mu3 = mu3p - 3 * mu * mu2 - mu**3 - g1 = mu3 / np.power(mu2, 1.5) - out0 = default.copy() - place(out0, cond, g1) - output.append(out0) - - if 'k' in moments: - if g2 is None: - mu4p = self._munp(4, *goodargs) - if mu is None: - mu = self._munp(1, *goodargs) - if mu2 is None: - mu2p = self._munp(2, *goodargs) - mu2 = mu2p - mu * mu - if mu3 is None: - mu3p = self._munp(3, *goodargs) - mu3 = mu3p - 3 * mu * mu2 - mu**3 - mu4 = mu4p - 4 * mu * mu3 - 6 * mu * mu * mu2 - mu**4 - g2 = mu4 / mu2**2.0 - 3.0 - out0 = default.copy() - place(out0, cond, g2) - output.append(out0) - else: # no valid args - output = [] - for _ in moments: - out0 = default.copy() - output.append(out0) - - if len(output) == 1: - return output[0] - else: - return tuple(output) - - def entropy(self, *args, **kwds): - """ - Differential entropy of the RV. - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - scale : array_like, optional (continuous distributions only). - Scale parameter (default=1). - - Notes - ----- - Entropy is defined base `e`: - - >>> drv = rv_discrete(values=((0, 1), (0.5, 0.5))) - >>> np.allclose(drv.entropy(), np.log(2.0)) - True - - """ - args, loc, scale = self._parse_args(*args, **kwds) - # NB: for discrete distributions scale=1 by construction in _parse_args - args = tuple(map(asarray, args)) - cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc) - output = zeros(shape(cond0), 'd') - place(output, (1-cond0), self.badvalue) - goodargs = argsreduce(cond0, *args) - # I don't know when or why vecentropy got broken when numargs == 0 - # 09.08.2013: is this still relevant? 
cf check_vecentropy test - # in tests/test_continuous_basic.py - if self.numargs == 0: - place(output, cond0, self._entropy() + log(scale)) - else: - place(output, cond0, self.vecentropy(*goodargs) + log(scale)) - return output - - def moment(self, n, *args, **kwds): - """ - n'th order non-central moment of distribution. - - Parameters - ---------- - n : int, n>=1 - Order of moment. - arg1, arg2, arg3,... : float - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - kwds : keyword arguments, optional - These can include "loc" and "scale", as well as other keyword - arguments relevant for a given distribution. - - """ - args, loc, scale = self._parse_args(*args, **kwds) - if not (self._argcheck(*args) and (scale > 0)): - return nan - if (floor(n) != n): - raise ValueError("Moment must be an integer.") - if (n < 0): - raise ValueError("Moment must be positive.") - mu, mu2, g1, g2 = None, None, None, None - if (n > 0) and (n < 5): - if self._stats_has_moments: - mdict = {'moments': {1: 'm', 2: 'v', 3: 'vs', 4: 'vk'}[n]} - else: - mdict = {} - mu, mu2, g1, g2 = self._stats(*args, **mdict) - val = _moment_from_stats(n, mu, mu2, g1, g2, self._munp, args) - - # Convert to transformed X = L + S*Y - # E[X^n] = E[(L+S*Y)^n] = L^n sum(comb(n, k)*(S/L)^k E[Y^k], k=0...n) - if loc == 0: - return scale**n * val - else: - result = 0 - fac = float(scale) / float(loc) - for k in range(n): - valk = _moment_from_stats(k, mu, mu2, g1, g2, self._munp, args) - result += comb(n, k, exact=True)*(fac**k) * valk - result += fac**n * val - return result * loc**n - - def median(self, *args, **kwds): - """ - Median of the distribution. - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - Location parameter, Default is 0. - scale : array_like, optional - Scale parameter, Default is 1. - - Returns - ------- - median : float - The median of the distribution. - - See Also - -------- - stats.distributions.rv_discrete.ppf - Inverse of the CDF - - """ - return self.ppf(0.5, *args, **kwds) - - def mean(self, *args, **kwds): - """ - Mean of the distribution - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - mean : float - the mean of the distribution - """ - kwds['moments'] = 'm' - res = self.stats(*args, **kwds) - if isinstance(res, ndarray) and res.ndim == 0: - return res[()] - return res - - def var(self, *args, **kwds): - """ - Variance of the distribution - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - var : float - the variance of the distribution - - """ - kwds['moments'] = 'v' - res = self.stats(*args, **kwds) - if isinstance(res, ndarray) and res.ndim == 0: - return res[()] - return res - - def std(self, *args, **kwds): - """ - Standard deviation of the distribution. - - Parameters - ---------- - arg1, arg2, arg3,... 
: array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - std : float - standard deviation of the distribution - - """ - kwds['moments'] = 'v' - res = sqrt(self.stats(*args, **kwds)) - return res - - def interval(self, alpha, *args, **kwds): - """ - Confidence interval with equal areas around the median. + return rv_frozen(self, *args, **kwds) - Parameters - ---------- - alpha : array_like of float - Probability that an rv will be drawn from the returned range. - Each value should be in the range [0, 1]. - arg1, arg2, ... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - location parameter, Default is 0. - scale : array_like, optional - scale parameter, Default is 1. - - Returns - ------- - a, b : ndarray of float - end-points of range that contain ``100 * alpha %`` of the rv's - possible values. - - """ - alpha = asarray(alpha) - if any((alpha > 1) | (alpha < 0)): - raise ValueError("alpha must be between 0 and 1 inclusive") - q1 = (1.0-alpha)/2 - q2 = (1.0+alpha)/2 - a = self.ppf(q1, *args, **kwds) - b = self.ppf(q2, *args, **kwds) - return a, b - - -## continuous random variables: implement maybe later -## -## hf --- Hazard Function (PDF / SF) -## chf --- Cumulative hazard function (-log(SF)) -## psf --- Probability sparsity function (reciprocal of the pdf) in -## units of percent-point-function (as a function of q). -## Also, the derivative of the percent-point function. - -class rv_continuous(rv_generic): - """ - A generic continuous random variable class meant for subclassing. - `rv_continuous` is a base class to construct specific distribution classes - and instances from for continuous random variables. It cannot be used - directly as a distribution. +def link(self, x, logSF, theta, i): + ''' + Return theta[i] as function of quantile, survival probability and + theta[j] for j!=i. Parameters ---------- - momtype : int, optional - The type of generic moment calculation to use: 0 for pdf, 1 (default) - for ppf. - a : float, optional - Lower bound of the support of the distribution, default is minus - infinity. - b : float, optional - Upper bound of the support of the distribution, default is plus - infinity. - xtol : float, optional - The tolerance for fixed point calculation for generic ppf. - badvalue : object, optional - The value in a result arrays that indicates a value that for which - some argument restriction is violated, default is np.nan. - name : str, optional - The name of the instance. This string is used to construct the default - example for distributions. - longname : str, optional - This string is used as part of the first line of the docstring returned - when a subclass has no docstring of its own. Note: `longname` exists - for backwards compatibility, do not use for new subclasses. - shapes : str, optional - The shape of the distribution. For example ``"m, n"`` for a - distribution that takes two integers as the two shape arguments for all - its methods. - extradoc : str, optional, deprecated - This string is used as the last part of the docstring returned when a - subclass has no docstring of its own. Note: `extradoc` exists for - backwards compatibility, do not use for new subclasses. 
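The constructor parameters above can be exercised directly; a minimal sketch with explicit ``a``, ``b`` and ``shapes`` (the ``pow_gen`` name and density are illustrative), relying on the generic ``cdf`` obtained by integrating ``_pdf`` numerically::

    >>> import numpy as np
    >>> from scipy import stats
    >>> class pow_gen(stats.rv_continuous):
    ...     def _pdf(self, x, c):
    ...         # density c*x**(c-1) on the support [a, b] = [0, 1]
    ...         return c * x**(c - 1.0)
    >>> pw = pow_gen(a=0.0, b=1.0, shapes='c', name='pow')
    >>> np.allclose(pw.cdf(0.5, 2.0), 0.25)   # cdf is x**c; 0.5**2 == 0.25
    True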
+    x : quantile
+    logSF : logarithm of the survival probability
+    theta : list
+        all distribution parameters including location and scale.

-    Methods
+    Returns
     -------
-    ``rvs(<shape(s)>, loc=0, scale=1, size=1)``
-        random variates
-
-    ``pdf(x, <shape(s)>, loc=0, scale=1)``
-        probability density function
-
-    ``logpdf(x, <shape(s)>, loc=0, scale=1)``
-        log of the probability density function
-
-    ``cdf(x, <shape(s)>, loc=0, scale=1)``
-        cumulative distribution function
-
-    ``logcdf(x, <shape(s)>, loc=0, scale=1)``
-        log of the cumulative distribution function
-
-    ``sf(x, <shape(s)>, loc=0, scale=1)``
-        survival function (1-cdf --- sometimes more accurate)
-
-    ``logsf(x, <shape(s)>, loc=0, scale=1)``
-        log of the survival function
-
-    ``ppf(q, <shape(s)>, loc=0, scale=1)``
-        percent point function (inverse of cdf --- quantiles)
-
-    ``isf(q, <shape(s)>, loc=0, scale=1)``
-        inverse survival function (inverse of sf)
-
-    ``moment(n, <shape(s)>, loc=0, scale=1)``
-        non-central n-th moment of the distribution.  May not work for array
-        arguments.
-
-    ``stats(<shape(s)>, loc=0, scale=1, moments='mv')``
-        mean('m'), variance('v'), skew('s'), and/or kurtosis('k')
-
-    ``entropy(<shape(s)>, loc=0, scale=1)``
-        (differential) entropy of the RV.
-
-    ``fit(data, <shape(s)>, loc=0, scale=1)``
-        Parameter estimates for generic data
-
-    ``expect(func=None, args=(), loc=0, scale=1, lb=None, ub=None, conditional=False, **kwds)``
-        Expected value of a function with respect to the distribution.
-        Additional kwd arguments passed to integrate.quad
-
-    ``median(<shape(s)>, loc=0, scale=1)``
-        Median of the distribution.
-
-    ``mean(<shape(s)>, loc=0, scale=1)``
-        Mean of the distribution.
-
-    ``std(<shape(s)>, loc=0, scale=1)``
-        Standard deviation of the distribution.
-
-    ``var(<shape(s)>, loc=0, scale=1)``
-        Variance of the distribution.
-
-    ``interval(alpha, <shape(s)>, loc=0, scale=1)``
-        Interval that with `alpha` percent probability contains a random
-        realization of this distribution.
-
-    ``__call__(<shape(s)>, loc=0, scale=1)``
-        Calling a distribution instance creates a frozen RV object with the
-        same methods but holding the given shape, location, and scale fixed.
-        See Notes section.
-
-    **Parameters for Methods**
-
-    x : array_like
-        quantiles
-    q : array_like
-        lower or upper tail probability
-    <shape(s)> : array_like
-        shape parameters
-    loc : array_like, optional
-        location parameter (default=0)
-    scale : array_like, optional
-        scale parameter (default=1)
-    size : int or tuple of ints, optional
-        shape of random variates (default computed from input arguments)
-    moments : string, optional
-        composed of letters ['mvsk'] specifying which moments to compute where
-        'm' = mean, 'v' = variance, 's' = (Fisher's) skew and
-        'k' = (Fisher's) kurtosis. (default='mv')
-    n : int
-        order of moment to calculate in method moments
-
-    Notes
-    -----
-
-    **Methods that can be overwritten by subclasses**
-    ::
-
-      _rvs
-      _pdf
-      _cdf
-      _sf
-      _ppf
-      _isf
-      _stats
-      _munp
-      _entropy
-      _argcheck
-
-    There are additional (internal and private) generic methods that can
-    be useful for cross-checking and for debugging, but might not work in
-    all cases when directly called.
-
-    **Frozen Distribution**
-
-    Alternatively, the object may be called (as a function) to fix the shape,
-    location, and scale parameters, returning a "frozen" continuous RV object:
-
-    rv = generic(<shape(s)>, loc=0, scale=1)
-        frozen RV object with the same methods but holding the given shape,
-        location, and scale fixed
-
-    **Subclassing**
+    theta[i] : real scalar
+        fixed distribution parameter theta[i] as function of x, logSF and
+        theta[j] where j != i.
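For a pure location-scale family such a link can be written down directly; a sketch for the exponential distribution, where SF(x) = exp(-(x - loc)/scale) (the ``expon_link`` name is illustrative)::

    def expon_link(x, logSF, theta, i):
        # theta = [loc, scale]; logSF = -(x - loc)/scale
        loc, scale = theta
        if i == 1:
            return -(x - loc) / logSF   # solve for scale
        return x + scale * logSF        # solve for loc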
+
+    LINK is a function connecting the fixed distribution parameter theta[i]
+    with the quantile (x) and the survival probability (SF) and the
+    remaining free distribution parameters theta[j] for j != i, i.e.:
+        theta[i] = link(x, logSF, theta, i),
+    where logSF = log(Prob(X>x; theta)).
+
+    See also
+    --------
+    estimation.Profile
+    '''
+    return self._link(x, logSF, theta, i)

-    New random variables can be defined by subclassing the rv_continuous class
-    and re-defining at least the ``_pdf`` or the ``_cdf`` method (normalized
-    to location 0 and scale 1) which will be given clean arguments (in between
-    a and b) that pass the argument check method.
-    If positive argument checking is not correct for your RV
-    then you will also need to re-define the ``_argcheck`` method.

+def _link(self, x, logSF, theta, i):
+    msg = ('Link function not implemented for the %s distribution' %
+           self.name)
+    raise NotImplementedError(msg)

-    Correct, but potentially slow, defaults exist for the remaining
-    methods but for speed and/or accuracy you can over-ride::

-      _logpdf, _cdf, _logcdf, _ppf, _rvs, _isf, _sf, _logsf

+def nlogps(self, theta, x):
+    """Moran's negative log Product Spacings statistic

-    Rarely would you override ``_isf``, ``_sf`` or ``_logsf``, but you could.

+    where theta are the parameters (including loc and scale)

-    Statistics are computed using numerical integration by default.
-    For speed you can redefine this using ``_stats``:

+    Note the data in x must be sorted

-     - take shape parameters and return mu, mu2, g1, g2
-     - If you can't compute one of these, return it as None
-     - Can also be defined with a keyword argument ``moments=<str>``,
-       where <str> is a string composed of 'm', 'v', 's',
-       and/or 'k'.  Only the components appearing in the string
-       should be computed and returned in the order 'm', 'v',
-       's', or 'k' with missing values returned as None.

+    References
+    ----------

-    Alternatively, you can override ``_munp``, which takes n and shape
-    parameters and returns the nth non-central moment of the distribution.

+    R. C. H. Cheng; N. A. K. Amin (1983)
+    "Estimating Parameters in Continuous Univariate Distributions with a
+    Shifted Origin.",
+    Journal of the Royal Statistical Society. Series B (Methodological),
+    Vol. 45, No. 3. (1983), pp. 394-403.

-    A note on ``shapes``: subclasses need not specify them explicitly. In this
-    case, the `shapes` will be automatically deduced from the signatures of the
-    overridden methods.
-    If, for some reason, you prefer to avoid relying on introspection, you can
-    specify ``shapes`` explicitly as an argument to the instance constructor.
-
-    Examples
-    --------
-    To create a new Gaussian distribution, we would do the following::
-
-        class gaussian_gen(rv_continuous):
-            "Gaussian distribution"
-            def _pdf(self, x):
-                ...
-            ...

+    R. C. H. Cheng; M. A. Stephens (1989)
+    "A Goodness-Of-Fit Test Using Moran's Statistic with Estimated
+    Parameters", Biometrika, 76, 2, pp. 385-392

+    Wong, T.S.T. and Li, W.K. (2006)
+    "A note on the estimation of extreme value distributions using maximum
+    product of spacings.",
+    IMS Lecture Notes Monograph Series 2006, Vol. 52, pp.
272-283 """ - def __init__(self, momtype=1, a=None, b=None, xtol=1e-14, - badvalue=None, name=None, longname=None, - shapes=None, extradoc=None): - - super(rv_continuous, self).__init__() - - # save the ctor parameters, cf generic freeze - self._ctor_param = dict( - momtype=momtype, a=a, b=b, xtol=xtol, - badvalue=badvalue, name=name, longname=longname, - shapes=shapes, extradoc=extradoc) - - if badvalue is None: - badvalue = nan - if name is None: - name = 'Distribution' - self.badvalue = badvalue - self.name = name - self.a = a - self.b = b - if a is None: - self.a = -inf - if b is None: - self.b = inf - self.xtol = xtol - self._size = 1 - self.moment_type = momtype - self.shapes = shapes - self._construct_argparser(meths_to_inspect=[self._pdf, self._cdf], - locscale_in='loc=0, scale=1', - locscale_out='loc, scale') - - # nin correction - self._ppfvec = vectorize(self._ppf_single, otypes='d') - self._ppfvec.nin = self.numargs + 1 - self.vecentropy = vectorize(self._entropy, otypes='d') - self._cdfvec = vectorize(self._cdf_single, otypes='d') - self._cdfvec.nin = self.numargs + 1 - - # backwards compat. these were removed in 0.14.0, put back but - # deprecated in 0.14.1: - self.vecfunc = np.deprecate(self._ppfvec, "vecfunc") - self.veccdf = np.deprecate(self._cdfvec, "veccdf") - - self.extradoc = extradoc - if momtype == 0: - self.generic_moment = vectorize(self._mom0_sc, otypes='d') - else: - self.generic_moment = vectorize(self._mom1_sc, otypes='d') - # Because of the *args argument of _mom0_sc, vectorize cannot count the - # number of arguments correctly. - self.generic_moment.nin = self.numargs + 1 - - if longname is None: - if name[0] in ['aeiouAEIOU']: - hstr = "An " - else: - hstr = "A " - longname = hstr + name - - if sys.flags.optimize < 2: - # Skip adding docstrings if interpreter is run with -OO - if self.__doc__ is None: - self._construct_default_doc(longname=longname, - extradoc=extradoc) - else: - dct = dict(distcont) - self._construct_doc(docdict, dct.get(self.name)) - - def _construct_default_doc(self, longname=None, extradoc=None): - """Construct instance docstring from the default template.""" - if longname is None: - longname = 'A' - if extradoc is None: - extradoc = '' - if extradoc.startswith('\n\n'): - extradoc = extradoc[2:] - self.__doc__ = ''.join(['%s continuous random variable.' % longname, - '\n\n%(before_notes)s\n', docheaders['notes'], - extradoc, '\n%(example)s']) - self._construct_doc(docdict) - - def _ppf_to_solve(self, x, q, *args): - return self.cdf(*(x, )+args)-q - - def _ppf_single(self, q, *args): - left = right = None - if self.a > -np.inf: - left = self.a - if self.b < np.inf: - right = self.b - - factor = 10. - if not left: # i.e. self.a = -inf - left = -1.*factor - while self._ppf_to_solve(left, q, *args) > 0.: - right = left - left *= factor - # left is now such that cdf(left) < q - if not right: # i.e. 
self.b = inf - right = factor - while self._ppf_to_solve(right, q, *args) < 0.: - left = right - right *= factor - # right is now such that cdf(right) > q - - return optimize.brentq(self._ppf_to_solve, - left, right, args=(q,)+args, xtol=self.xtol) - - # moment from definition - def _mom_integ0(self, x, m, *args): - return x**m * self.pdf(x, *args) - - def _mom0_sc(self, m, *args): - return integrate.quad(self._mom_integ0, self.a, self.b, - args=(m,)+args)[0] - - # moment calculated using ppf - def _mom_integ1(self, q, m, *args): - return (self.ppf(q, *args))**m - - def _mom1_sc(self, m, *args): - return integrate.quad(self._mom_integ1, 0, 1, args=(m,)+args)[0] - - def _pdf(self, x, *args): - return derivative(self._cdf, x, dx=1e-5, args=args, order=5) - - ## Could also define any of these - def _logpdf(self, x, *args): - return log(self._pdf(x, *args)) - - def _cdf_single(self, x, *args): - return integrate.quad(self._pdf, self.a, x, args=args)[0] - - def _cdf(self, x, *args): - return self._cdfvec(x, *args) - - ## generic _argcheck, _logcdf, _sf, _logsf, _ppf, _isf, _rvs are defined - ## in rv_generic - - def pdf(self, x, *args, **kwds): - """ - Probability density function at x of the given RV. - - Parameters - ---------- - x : array_like - quantiles - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - pdf : ndarray - Probability density function evaluated at x - - """ - args, loc, scale = self._parse_args(*args, **kwds) - x, loc, scale = map(asarray, (x, loc, scale)) - args = tuple(map(asarray, args)) - x = asarray((x-loc)*1.0/scale) - cond0 = self._argcheck(*args) & (scale > 0) - cond1 = (scale > 0) & (x >= self.a) & (x <= self.b) - cond = cond0 & cond1 - output = zeros(shape(cond), 'd') - putmask(output, (1-cond0)+np.isnan(x), self.badvalue) - if any(cond): - goodargs = argsreduce(cond, *((x,)+args+(scale,))) - scale, goodargs = goodargs[-1], goodargs[:-1] - place(output, cond, self._pdf(*goodargs) / scale) - if output.ndim == 0: - return output[()] - return output - - def logpdf(self, x, *args, **kwds): - """ - Log of the probability density function at x of the given RV. - - This uses a more numerically accurate calculation if available. - - Parameters - ---------- - x : array_like - quantiles - arg1, arg2, arg3,... 
: array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - logpdf : array_like - Log of the probability density function evaluated at x - - """ - args, loc, scale = self._parse_args(*args, **kwds) - x, loc, scale = map(asarray, (x, loc, scale)) - args = tuple(map(asarray, args)) - x = asarray((x-loc)*1.0/scale) - cond0 = self._argcheck(*args) & (scale > 0) - cond1 = (scale > 0) & (x >= self.a) & (x <= self.b) - cond = cond0 & cond1 - output = empty(shape(cond), 'd') - output.fill(NINF) - putmask(output, (1-cond0)+np.isnan(x), self.badvalue) - if any(cond): - goodargs = argsreduce(cond, *((x,)+args+(scale,))) - scale, goodargs = goodargs[-1], goodargs[:-1] - place(output, cond, self._logpdf(*goodargs) - log(scale)) - if output.ndim == 0: - return output[()] - return output - - def cdf(self, x, *args, **kwds): - """ - Cumulative distribution function of the given RV. - - Parameters - ---------- - x : array_like - quantiles - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - cdf : ndarray - Cumulative distribution function evaluated at `x` - - """ - args, loc, scale = self._parse_args(*args, **kwds) - x, loc, scale = map(asarray, (x, loc, scale)) - args = tuple(map(asarray, args)) - x = (x-loc)*1.0/scale - cond0 = self._argcheck(*args) & (scale > 0) - cond1 = (scale > 0) & (x > self.a) & (x < self.b) - cond2 = (x >= self.b) & cond0 - cond = cond0 & cond1 - output = zeros(shape(cond), 'd') - place(output, (1-cond0)+np.isnan(x), self.badvalue) - place(output, cond2, 1.0) - if any(cond): # call only if at least 1 entry - goodargs = argsreduce(cond, *((x,)+args)) - place(output, cond, self._cdf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def logcdf(self, x, *args, **kwds): - """ - Log of the cumulative distribution function at x of the given RV. - - Parameters - ---------- - x : array_like - quantiles - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - logcdf : array_like - Log of the cumulative distribution function evaluated at x - - """ - args, loc, scale = self._parse_args(*args, **kwds) - x, loc, scale = map(asarray, (x, loc, scale)) - args = tuple(map(asarray, args)) - x = (x-loc)*1.0/scale - cond0 = self._argcheck(*args) & (scale > 0) - cond1 = (scale > 0) & (x > self.a) & (x < self.b) - cond2 = (x >= self.b) & cond0 - cond = cond0 & cond1 - output = empty(shape(cond), 'd') - output.fill(NINF) - place(output, (1-cond0)*(cond1 == cond1)+np.isnan(x), self.badvalue) - place(output, cond2, 0.0) - if any(cond): # call only if at least 1 entry - goodargs = argsreduce(cond, *((x,)+args)) - place(output, cond, self._logcdf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def sf(self, x, *args, **kwds): - """ - Survival function (1-cdf) at x of the given RV. - - Parameters - ---------- - x : array_like - quantiles - arg1, arg2, arg3,... 
: array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - sf : array_like - Survival function evaluated at x - - """ - args, loc, scale = self._parse_args(*args, **kwds) - x, loc, scale = map(asarray, (x, loc, scale)) - args = tuple(map(asarray, args)) - x = (x-loc)*1.0/scale - cond0 = self._argcheck(*args) & (scale > 0) - cond1 = (scale > 0) & (x > self.a) & (x < self.b) - cond2 = cond0 & (x <= self.a) - cond = cond0 & cond1 - output = zeros(shape(cond), 'd') - place(output, (1-cond0)+np.isnan(x), self.badvalue) - place(output, cond2, 1.0) - if any(cond): - goodargs = argsreduce(cond, *((x,)+args)) - place(output, cond, self._sf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def logsf(self, x, *args, **kwds): - """ - Log of the survival function of the given RV. - - Returns the log of the "survival function," defined as (1 - `cdf`), - evaluated at `x`. - - Parameters - ---------- - x : array_like - quantiles - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - logsf : ndarray - Log of the survival function evaluated at `x`. - - """ - args, loc, scale = self._parse_args(*args, **kwds) - x, loc, scale = map(asarray, (x, loc, scale)) - args = tuple(map(asarray, args)) - x = (x-loc)*1.0/scale - cond0 = self._argcheck(*args) & (scale > 0) - cond1 = (scale > 0) & (x > self.a) & (x < self.b) - cond2 = cond0 & (x <= self.a) - cond = cond0 & cond1 - output = empty(shape(cond), 'd') - output.fill(NINF) - place(output, (1-cond0)+np.isnan(x), self.badvalue) - place(output, cond2, 0.0) - if any(cond): - goodargs = argsreduce(cond, *((x,)+args)) - place(output, cond, self._logsf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def ppf(self, q, *args, **kwds): - """ - Percent point function (inverse of cdf) at q of the given RV. - - Parameters - ---------- - q : array_like - lower tail probability - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - x : array_like - quantile corresponding to the lower tail probability q. 
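A round-trip sketch with the scipy-compatible ``norm``: ``ppf`` and ``cdf`` are inverses of each other on the open unit interval::

    >>> import numpy as np
    >>> from scipy import stats
    >>> q = stats.norm.ppf(0.975)          # approx. 1.96
    >>> np.allclose(stats.norm.cdf(q), 0.975)
    True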
- - """ - args, loc, scale = self._parse_args(*args, **kwds) - q, loc, scale = map(asarray, (q, loc, scale)) - args = tuple(map(asarray, args)) - cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc) - cond1 = (0 < q) & (q < 1) - cond2 = cond0 & (q == 0) - cond3 = cond0 & (q == 1) - cond = cond0 & cond1 - output = valarray(shape(cond), value=self.badvalue) - - lower_bound = self.a * scale + loc - upper_bound = self.b * scale + loc - place(output, cond2, argsreduce(cond2, lower_bound)[0]) - place(output, cond3, argsreduce(cond3, upper_bound)[0]) - - if any(cond): # call only if at least 1 entry - goodargs = argsreduce(cond, *((q,)+args+(scale, loc))) - scale, loc, goodargs = goodargs[-2], goodargs[-1], goodargs[:-2] - place(output, cond, self._ppf(*goodargs) * scale + loc) - if output.ndim == 0: - return output[()] - return output - - def isf(self, q, *args, **kwds): - """ - Inverse survival function at q of the given RV. - - Parameters - ---------- - q : array_like - upper tail probability - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - x : ndarray or scalar - Quantile corresponding to the upper tail probability q. - - """ - args, loc, scale = self._parse_args(*args, **kwds) - q, loc, scale = map(asarray, (q, loc, scale)) - args = tuple(map(asarray, args)) - cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc) - cond1 = (0 < q) & (q < 1) - cond2 = cond0 & (q == 1) - cond3 = cond0 & (q == 0) - cond = cond0 & cond1 - output = valarray(shape(cond), value=self.badvalue) - - lower_bound = self.a * scale + loc - upper_bound = self.b * scale + loc - place(output, cond2, argsreduce(cond2, lower_bound)[0]) - place(output, cond3, argsreduce(cond3, upper_bound)[0]) - - if any(cond): - goodargs = argsreduce(cond, *((q,)+args+(scale, loc))) - scale, loc, goodargs = goodargs[-2], goodargs[-1], goodargs[:-2] - place(output, cond, self._isf(*goodargs) * scale + loc) - if output.ndim == 0: - return output[()] - return output - - def link(self, x, logSF, theta, i): - ''' - Return theta[i] as function of quantile, survival probability and - theta[j] for j!=i. - - Parameters - ---------- - x : quantile - logSF : logarithm of the survival probability - theta : list - all distribution parameters including location and scale. - - Returns - ------- - theta[i] : real scalar - fixed distribution parameter theta[i] as function of x, logSF and - theta[j] where j != i. - - LINK is a function connecting the fixed distribution parameter theta[i] - with the quantile (x) and the survival probability (SF) and the - remaining free distribution parameters theta[j] for j!=i, i.e.: - theta[i] = link(x, logSF, theta, i), - where logSF = log(Prob(X>x; theta)). - - See also - estimation.Profile - ''' - return self._link(x, logSF, theta, i) - - def _link(self, x, logSF, theta, i): - msg = ('Link function not implemented for the %s distribution' % - self.name) - raise NotImplementedError(msg) - - - def _nnlf(self, x, *args): - return -sum(self._logpdf(x, *args), axis=0) - - def nnlf(self, theta, x): - '''Return negative loglikelihood function - - Notes - ----- - This is ``-sum(log pdf(x, theta), axis=0)`` where theta are the - parameters (including loc and scale). 
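Because the unit exponential has logpdf(x) = -x, the identity in the Notes is easy to check numerically (a sketch; ``nnlf`` is part of the semi-public distribution API)::

    >>> import numpy as np
    >>> from scipy import stats
    >>> x = np.array([0.5, 1.0, 2.0])
    >>> nll = stats.expon.nnlf((0.0, 1.0), x)   # theta = (loc, scale)
    >>> np.allclose(nll, -stats.expon.logpdf(x).sum())
    True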
- ''' - try: - loc = theta[-2] - scale = theta[-1] - args = tuple(theta[:-2]) - except IndexError: - raise ValueError("Not enough input arguments.") - if not self._argcheck(*args) or scale <= 0: - return inf - x = asarray((x-loc) / scale) - cond0 = (x <= self.a) | (self.b <= x) - if (any(cond0)): - return inf - else: - N = len(x) - return self._nnlf(x, *args) + N * log(scale) - - def _penalized_nnlf(self, theta, x): - ''' Return negative loglikelihood function, - i.e., - sum (log pdf(x, theta), axis=0) - where theta are the parameters (including loc and scale) - ''' - try: - loc = theta[-2] - scale = theta[-1] - args = tuple(theta[:-2]) - except IndexError: - raise ValueError("Not enough input arguments.") - if not self._argcheck(*args) or scale <= 0: - return inf - x = asarray((x-loc) / scale) - - loginf = log(_XMAX) - - if np.isneginf(self.a).all() and np.isinf(self.b).all(): - Nbad = 0 - else: - cond0 = (x <= self.a) | (self.b <= x) - Nbad = sum(cond0) - if Nbad > 0: - x = argsreduce(~cond0, x)[0] - - N = len(x) - return self._nnlf(x, *args) + N*log(scale) + Nbad * 100.0 * loginf - - def hessian_nnlf(self, theta, data, eps=None): - ''' approximate hessian of nnlf where theta are the parameters (including loc and scale) - ''' - #Nd = len(x) - np = len(theta) - # pab 07.01.2001: Always choose the stepsize h so that - # it is an exactly representable number. - # This is important when calculating numerical derivatives and is - # accomplished by the following. - - if eps == None: - eps = (_EPS) ** 0.4 - #xmin = floatinfo.machar.xmin - #myfun = lambda y: max(y,100.0*log(xmin)) #% trick to avoid log of zero - delta = (eps + 2.0) - 2.0 - delta2 = delta ** 2.0 - # Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with - # 1/(d^2 L(theta|x)/dtheta^2) - # using central differences - - LL = self.nnlf(theta, data) - H = zeros((np, np)) #%% Hessian matrix - theta = tuple(theta) - for ix in xrange(np): - sparam = list(theta) - sparam[ix] = theta[ix] + delta - fp = self.nnlf(sparam, data) - #fp = sum(myfun(x)) - - sparam[ix] = theta[ix] - delta - fm = self.nnlf(sparam, data) - #fm = sum(myfun(x)) - - H[ix, ix] = (fp - 2 * LL + fm) / delta2 - for iy in range(ix + 1, np): - sparam[ix] = theta[ix] + delta - sparam[iy] = theta[iy] + delta - fpp = self.nnlf(sparam, data) - #fpp = sum(myfun(x)) - - sparam[iy] = theta[iy] - delta - fpm = self.nnlf(sparam, data) - #fpm = sum(myfun(x)) - - sparam[ix] = theta[ix] - delta - fmm = self.nnlf(sparam, data) - #fmm = sum(myfun(x)); - - sparam[iy] = theta[iy] + delta - fmp = self.nnlf(sparam, data) - #fmp = sum(myfun(x)) - H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2) - H[iy, ix] = H[ix, iy] - sparam[iy] = theta[iy] - - # invert the Hessian matrix (i.e. invert the observed information number) - #pcov = -pinv(H); - return - H - - def nlogps(self, theta, x): - """ Moran's negative log Product Spacings statistic - - where theta are the parameters (including loc and scale) - - Note the data in x must be sorted - - References - ----------- - - R. C. H. Cheng; N. A. K. Amin (1983) - "Estimating Parameters in Continuous Univariate Distributions with a - Shifted Origin.", - Journal of the Royal Statistical Society. Series B (Methodological), - Vol. 45, No. 3. (1983), pp. 394-403. - - R. C. H. Cheng; M. A. Stephens (1989) - "A Goodness-Of-Fit Test Using Moran's Statistic with Estimated - Parameters", Biometrika, 76, 2, pp 385-392 - - Wong, T.S.T. and Li, W.K. 
(2006) - "A note on the estimation of extreme value distributions using maximum - product of spacings.", - IMS Lecture Notes Monograph Series 2006, Vol. 52, pp. 272-283 - """ - - try: - loc = theta[-2] - scale = theta[-1] - args = tuple(theta[:-2]) - except IndexError: - raise ValueError("Not enough input arguments.") - if not self._argcheck(*args) or scale <= 0: - return inf - x = asarray((x - loc) / scale) - cond0 = (x <= self.a) | (self.b <= x) - Nbad = sum(cond0) - if Nbad > 0: - x = argsreduce(~cond0, x)[0] - - lowertail = True - if lowertail: - prb = np.hstack((0.0, self.cdf(x, *args), 1.0)) - dprb = np.diff(prb) - else: - prb = np.hstack((1.0, self.sf(x, *args), 0.0)) - dprb = -np.diff(prb) - - logD = log(dprb) - dx = np.diff(x, axis=0) - tie = (dx == 0) - if any(tie): - # TODO : implement this method for treating ties in data: - # Assume measuring error is delta. Then compute - # yL = F(xi-delta,theta) - # yU = F(xi+delta,theta) - # and replace - # logDj = log((yU-yL)/(r-1)) for j = i+1,i+2,...i+r-1 - - # The following is OK when only minimization of T is wanted - i_tie, = np.nonzero(tie) - tiedata = x[i_tie] - logD[i_tie + 1] = log(self._pdf(tiedata, *args)) - log(scale) - - finiteD = np.isfinite(logD) - nonfiniteD = 1 - finiteD - Nbad += sum(nonfiniteD, axis=0) - if Nbad > 0: - T = -sum(logD[finiteD], axis=0) + 100.0 * log(_XMAX) * Nbad - else: - T = -sum(logD, axis=0) #Moran's negative log product spacing statistic - return T - - def hessian_nlogps(self, theta, data, eps=None): - ''' approximate hessian of nlogps where theta are the parameters (including loc and scale) - ''' - np = len(theta) - # pab 07.01.2001: Always choose the stepsize h so that - # it is an exactly representable number. - # This is important when calculating numerical derivatives and is - # accomplished by the following. - - if eps == None: - eps = (_EPS) ** 0.4 - #xmin = floatinfo.machar.xmin - #myfun = lambda y: max(y,100.0*log(xmin)) #% trick to avoid log of zero - delta = (eps + 2.0) - 2.0 - delta2 = delta ** 2.0 - # Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with - # 1/(d^2 L(theta|x)/dtheta^2) - # using central differences - - LL = self.nlogps(theta, data) - H = zeros((np, np)) # Hessian matrix - theta = tuple(theta) - for ix in xrange(np): - sparam = list(theta) - sparam[ix] = theta[ix] + delta - fp = self.nlogps(sparam, data) - #fp = sum(myfun(x)) - - sparam[ix] = theta[ix] - delta - fm = self.nlogps(sparam, data) - #fm = sum(myfun(x)) - - H[ix, ix] = (fp - 2 * LL + fm) / delta2 - for iy in range(ix + 1, np): - sparam[ix] = theta[ix] + delta - sparam[iy] = theta[iy] + delta - fpp = self.nlogps(sparam, data) - #fpp = sum(myfun(x)) - - sparam[iy] = theta[iy] - delta - fpm = self.nlogps(sparam, data) - #fpm = sum(myfun(x)) - - sparam[ix] = theta[ix] - delta - fmm = self.nlogps(sparam, data) - #fmm = sum(myfun(x)); - - sparam[iy] = theta[iy] + delta - fmp = self.nlogps(sparam, data) - #fmp = sum(myfun(x)) - H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2) - H[iy, ix] = H[ix, iy] - sparam[iy] = theta[iy]; - - # invert the Hessian matrix (i.e. 
invert the observed information number) - #pcov = -pinv(H); - return - H - - # return starting point for fit (shape arguments + loc + scale) - def _fitstart(self, data, args=None): - if args is None: - args = (1.0,)*self.numargs - return args + self.fit_loc_scale(data, *args) - - # Return the (possibly reduced) function to optimize in order to find MLE - # estimates for the .fit method - def _reduce_func(self, args, kwds): - args = list(args) - Nargs = len(args) - fixedn = [] - index = list(range(Nargs)) - names = ['f%d' % n for n in range(Nargs - 2)] + ['floc', 'fscale'] - x0 = [] - for n, key in zip(index, names): - if key in kwds: - fixedn.append(n) - args[n] = kwds[key] - else: - x0.append(args[n]) - method = kwds.get('method', 'ml').lower() - if method.startswith('mps'): - fitfun = self.nlogps - else: - fitfun = self._penalized_nnlf - - if len(fixedn) == 0: - func = fitfun - restore = None - else: - if len(fixedn) == len(index): - raise ValueError( - "All parameters fixed. There is nothing to optimize.") - - def restore(args, theta): - # Replace with theta for all numbers not in fixedn - # This allows the non-fixed values to vary, but - # we still call self.nnlf with all parameters. - i = 0 - for n in range(Nargs): - if n not in fixedn: - args[n] = theta[i] - i += 1 - return args - - def func(theta, x): - newtheta = restore(args[:], theta) - return fitfun(newtheta, x) - - return x0, func, restore, args - - def fit(self, data, *args, **kwds): - """ - Return MLEs for shape, location, and scale parameters from data. - - MLE stands for Maximum Likelihood Estimate. Starting estimates for - the fit are given by input arguments; for any arguments not provided - with starting estimates, ``self._fitstart(data)`` is called to generate - such. - - One can hold some parameters fixed to specific values by passing in - keyword arguments ``f0``, ``f1``, ..., ``fn`` (for shape parameters) - and ``floc`` and ``fscale`` (for location and scale parameters, - respectively). - - Parameters - ---------- - data : array_like - Data to use in calculating the MLEs. - args : floats, optional - Starting value(s) for any shape-characterizing arguments (those not - provided will be determined by a call to ``_fitstart(data)``). - No default value. - kwds : floats, optional - Starting values for the location and scale parameters; no default. - Special keyword arguments are recognized as holding certain - parameters fixed: - - f0...fn : hold respective shape parameters fixed. - - floc : hold location parameter fixed to specified value. - - fscale : hold scale parameter fixed to specified value. - - optimizer : The optimizer to use. The optimizer must take func, - and starting position as the first two arguments, - plus args (for extra arguments to pass to the - function to be optimized) and disp=0 to suppress - output as keyword arguments. - - Returns - ------- - shape, loc, scale : tuple of floats - MLEs for any shape statistics, followed by those for location and - scale. - - Notes - ----- - This fit is computed by maximizing a log-likelihood function, with - penalty applied for samples outside of range of the distribution. The - returned answer is not guaranteed to be the globally optimal MLE, it - may only be locally optimal, or the optimization may fail altogether. 
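A usage sketch of ``fit`` with the location held fixed (the data generation is illustrative; the shape and scale estimates should land near, not exactly at, 2.0 and 3.0)::

    >>> import numpy as np
    >>> from scipy import stats
    >>> np.random.seed(1234)
    >>> data = stats.gamma.rvs(2.0, loc=0.0, scale=3.0, size=2000)
    >>> a_hat, loc_hat, scale_hat = stats.gamma.fit(data, floc=0.0)
    >>> loc_hat                                # held fixed, returned verbatim
    0.0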
- """ - Narg = len(args) - if Narg > self.numargs: - raise TypeError("Too many input arguments.") - - start = [None]*2 - if (Narg < self.numargs) or not ('loc' in kwds and - 'scale' in kwds): - # get distribution specific starting locations - start = self._fitstart(data) - args += start[Narg:-2] - loc = kwds.get('loc', start[-2]) - scale = kwds.get('scale', start[-1]) - args += (loc, scale) - x0, func, restore, args = self._reduce_func(args, kwds) - - optimizer = kwds.get('optimizer', optimize.fmin) - # convert string to function in scipy.optimize - if not callable(optimizer) and isinstance(optimizer, string_types): - if not optimizer.startswith('fmin_'): - optimizer = "fmin_"+optimizer - if optimizer == 'fmin_': - optimizer = 'fmin' - try: - optimizer = getattr(optimize, optimizer) - except AttributeError: - raise ValueError("%s is not a valid optimizer" % optimizer) - vals = optimizer(func, x0, args=(ravel(data),), disp=0) - if restore is not None: - vals = restore(args, vals) - vals = tuple(vals) - return vals - - def fit2(self, data, *args, **kwds): - ''' Return Maximum Likelihood or Maximum Product Spacing estimator object - - Parameters - ---------- - data : array-like - Data to use in calculating the ML or MPS estimators - args : optional - Starting values for any shape arguments (those not specified - will be determined by dist._fitstart(data)) - kwds : loc, scale - Starting values for the location and scale parameters - Special keyword arguments are recognized as holding certain - parameters fixed: - f0..fn : hold respective shape paramters fixed - floc : hold location parameter fixed to specified value - fscale : hold scale parameter fixed to specified value - method : of estimation. Options are - 'ml' : Maximum Likelihood method (default) - 'mps': Maximum Product Spacing method - alpha : scalar, optional - Confidence coefficent (default=0.05) - search : bool - If true search for best estimator (default), - otherwise return object with initial distribution parameters - copydata : bool - If true copydata (default) - optimizer : The optimizer to use. The optimizer must take func, - and starting position as the first two arguments, - plus args (for extra arguments to pass to the - function to be optimized) and disp=0 to suppress - output as keyword arguments. - - Return - ------ - phat : FitDistribution object - Fitted distribution object with following member variables: - LLmax : loglikelihood function evaluated using par - LPSmax : log product spacing function evaluated using par - pvalue : p-value for the fit - par : distribution parameters (fixed and fitted) - par_cov : covariance of distribution parameters - par_fix : fixed distribution parameters - par_lower : lower (1-alpha)% confidence bound for the parameters - par_upper : upper (1-alpha)% confidence bound for the parameters - - Note - ---- - `data` is sorted using this function, so if `copydata`==False the data - in your namespace will be sorted as well. - ''' - return FitDistribution(self, data, *args, **kwds) - - def fit_loc_scale(self, data, *args): - """ - Estimate loc and scale parameters from data using 1st and 2nd moments. - - Parameters - ---------- - data : array_like - Data to fit. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - - Returns - ------- - Lhat : float - Estimated location parameter for the data. - Shat : float - Estimated scale parameter for the data. 
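For a distribution whose standardized form has mu = 0 and mu2 = 1 (such as ``norm``), the moment-matching formulas reduce to the sample mean and the population standard deviation, which makes a direct check possible::

    >>> import numpy as np
    >>> from scipy import stats
    >>> data = np.array([1.0, 2.0, 3.0, 4.0])
    >>> Lhat, Shat = stats.norm.fit_loc_scale(data)
    >>> np.allclose((Lhat, Shat), (data.mean(), data.std()))
    True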
- - """ - mu, mu2 = self.stats(*args, **{'moments': 'mv'}) - tmp = asarray(data) - muhat = tmp.mean() - mu2hat = tmp.var() - Shat = sqrt(mu2hat / mu2) - Lhat = muhat - Shat*mu - if not np.isfinite(Lhat): - Lhat = 0 - if not (np.isfinite(Shat) and (0 < Shat)): - Shat = 1 - return Lhat, Shat - - @np.deprecate - def est_loc_scale(self, data, *args): - """This function is deprecated, use self.fit_loc_scale(data) instead. - """ - return self.fit_loc_scale(data, *args) - - def _entropy(self, *args): - def integ(x): - val = self._pdf(x, *args) - return -xlogy(val, val) - - # upper limit is often inf, so suppress warnings when integrating - olderr = np.seterr(over='ignore') - h = integrate.quad(integ, self.a, self.b)[0] - np.seterr(**olderr) - - if not np.isnan(h): - return h - else: - # try with different limits if integration problems - low, upp = self.ppf([1e-10, 1. - 1e-10], *args) - if np.isinf(self.b): - upper = upp - else: - upper = self.b - if np.isinf(self.a): - lower = low - else: - lower = self.a - return integrate.quad(integ, lower, upper)[0] - - def entropy(self, *args, **kwds): - """ - Differential entropy of the RV. - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - scale : array_like, optional - Scale parameter (default=1). - - """ - args, loc, scale = self._parse_args(*args, **kwds) - args = tuple(map(asarray, args)) - cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc) - output = zeros(shape(cond0), 'd') - place(output, (1-cond0), self.badvalue) - goodargs = argsreduce(cond0, *args) - # np.vectorize doesn't work when numargs == 0 in numpy 1.5.1 - if self.numargs == 0: - place(output, cond0, self._entropy() + log(scale)) - else: - place(output, cond0, self.vecentropy(*goodargs) + log(scale)) - - return output - - def expect(self, func=None, args=(), loc=0, scale=1, lb=None, ub=None, - conditional=False, **kwds): - """Calculate expected value of a function with respect to the - distribution. - - The expected value of a function ``f(x)`` with respect to a - distribution ``dist`` is defined as:: - - ubound - E[x] = Integral(f(x) * dist.pdf(x)) - lbound - - Parameters - ---------- - func : callable, optional - Function for which integral is calculated. Takes only one argument. - The default is the identity mapping f(x) = x. - args : tuple, optional - Argument (parameters) of the distribution. - lb, ub : scalar, optional - Lower and upper bound for integration. default is set to the - support of the distribution. - conditional : bool, optional - If True, the integral is corrected by the conditional probability - of the integration interval. The return value is the expectation - of the function, conditional on being in the given interval. - Default is False. - - Additional keyword arguments are passed to the integration routine. - - Returns - ------- - expect : float - The calculated expected value. - - Notes - ----- - The integration behavior of this function is inherited from - `integrate.quad`. 
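A short sketch: the second moment of the standard normal is 1, so::

    >>> import numpy as np
    >>> from scipy import stats
    >>> np.allclose(stats.norm.expect(lambda x: x**2, loc=0.0, scale=1.0), 1.0)
    True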
- - """ - lockwds = {'loc': loc, - 'scale': scale} - self._argcheck(*args) - if func is None: - def fun(x, *args): - return x * self.pdf(x, *args, **lockwds) - else: - def fun(x, *args): - return func(x) * self.pdf(x, *args, **lockwds) - if lb is None: - lb = loc + self.a * scale - if ub is None: - ub = loc + self.b * scale - if conditional: - invfac = (self.sf(lb, *args, **lockwds) - - self.sf(ub, *args, **lockwds)) - else: - invfac = 1.0 - kwds['args'] = args - # Silence floating point warnings from integration. - olderr = np.seterr(all='ignore') - vals = integrate.quad(fun, lb, ub, **kwds)[0] / invfac - np.seterr(**olderr) - return vals - - -## Handlers for generic case where xk and pk are given -## The _drv prefix probably means discrete random variable. - -def _drv_pmf(self, xk, *args): try: - return self.P[xk] - except KeyError: - return 0.0 - - -def _drv_cdf(self, xk, *args): - indx = argmax((self.xk > xk), axis=-1)-1 - return self.F[self.xk[indx]] - - -def _drv_ppf(self, q, *args): - indx = argmax((self.qvals >= q), axis=-1) - return self.Finv[self.qvals[indx]] - - -def _drv_nonzero(self, k, *args): - return 1 - - -def _drv_moment(self, n, *args): - n = asarray(n) - return sum(self.xk**n[np.newaxis, ...] * self.pk, axis=0) - - -def _drv_moment_gen(self, t, *args): - t = asarray(t) - return sum(exp(self.xk * t[np.newaxis, ...]) * self.pk, axis=0) - - -def _drv2_moment(self, n, *args): - """Non-central moment of discrete distribution.""" - # many changes, originally not even a return - tot = 0.0 - diff = 1e100 - # pos = self.a - pos = max(0.0, 1.0*self.a) - count = 0 - # handle cases with infinite support - ulimit = max(1000, (min(self.b, 1000) + max(self.a, -1000))/2.0) - llimit = min(-1000, (min(self.b, 1000) + max(self.a, -1000))/2.0) - - while (pos <= self.b) and ((pos <= ulimit) or - (diff > self.moment_tol)): - diff = np.power(pos, n) * self.pmf(pos, *args) - # use pmf because _pmf does not check support in randint and there - # might be problems ? 
with correct self.a, self.b at this stage - tot += diff - pos += self.inc - count += 1 - - if self.a < 0: # handle case when self.a = -inf - diff = 1e100 - pos = -self.inc - while (pos >= self.a) and ((pos >= llimit) or - (diff > self.moment_tol)): - diff = np.power(pos, n) * self.pmf(pos, *args) - # using pmf instead of _pmf, see above - tot += diff - pos -= self.inc - count += 1 - return tot - - -def _drv2_ppfsingle(self, q, *args): # Use basic bisection algorithm - b = self.b - a = self.a - if isinf(b): # Be sure ending point is > q - b = int(max(100*q, 10)) - while 1: - if b >= self.b: - qb = 1.0 - break - qb = self._cdf(b, *args) - if (qb < q): - b += 10 - else: - break + loc = theta[-2] + scale = theta[-1] + args = tuple(theta[:-2]) + except IndexError: + raise ValueError("Not enough input arguments.") + if not self._argcheck(*args) or scale <= 0: + return inf + x = asarray((x - loc) / scale) + cond0 = (x <= self.a) | (self.b <= x) + Nbad = np.sum(cond0) + if Nbad > 0: + x = argsreduce(~cond0, x)[0] + + lowertail = True + if lowertail: + prb = np.hstack((0.0, self.cdf(x, *args), 1.0)) + dprb = np.diff(prb) else: - qb = 1.0 - if isinf(a): # be sure starting point < q - a = int(min(-100*q, -10)) - while 1: - if a <= self.a: - qb = 0.0 - break - qa = self._cdf(a, *args) - if (qa > q): - a -= 10 - else: - break + prb = np.hstack((1.0, self.sf(x, *args), 0.0)) + dprb = -np.diff(prb) + + logD = log(dprb) + dx = np.diff(x, axis=0) + tie = (dx == 0) + if any(tie): + # TODO : implement this method for treating ties in data: + # Assume measuring error is delta. Then compute + # yL = F(xi-delta,theta) + # yU = F(xi+delta,theta) + # and replace + # logDj = log((yU-yL)/(r-1)) for j = i+1,i+2,...i+r-1 + + # The following is OK when only minimization of T is wanted + i_tie, = np.nonzero(tie) + tiedata = x[i_tie] + logD[i_tie + 1] = log(self._pdf(tiedata, *args)) - log(scale) + + finiteD = np.isfinite(logD) + nonfiniteD = 1 - finiteD + Nbad += np.sum(nonfiniteD, axis=0) + if Nbad > 0: + T = -np.sum(logD[finiteD], axis=0) + 100.0 * np.log(_XMAX) * Nbad else: - qa = self._cdf(a, *args) - - while 1: - if (qa == q): - return a - if (qb == q): - return b - if b <= a+1: - # testcase: return wrong number at lower index - # python -c "from scipy.stats import zipf;print zipf.ppf(0.01, 2)" wrong - # python -c "from scipy.stats import zipf;print zipf.ppf([0.01, 0.61, 0.77, 0.83], 2)" - # python -c "from scipy.stats import logser;print logser.ppf([0.1, 0.66, 0.86, 0.93], 0.6)" - if qa > q: - return a - else: - return b - c = int((a+b)/2.0) - qc = self._cdf(c, *args) - if (qc < q): - if a != c: - a = c - else: - raise RuntimeError('updating stopped, endless loop') - qa = qc - elif (qc > q): - if b != c: - b = c - else: - raise RuntimeError('updating stopped, endless loop') - qb = qc + T = -np.sum(logD, axis=0) + return T + + +def _reduce_func(self, args, kwds): + # First of all, convert fshapes params to fnum: eg for stats.beta, + # shapes='a, b'. To fix `a`, can specify either `f1` or `fa`. + # Convert the latter into the former. + if self.shapes: + shapes = self.shapes.replace(',', ' ').split() + for j, s in enumerate(shapes): + val = kwds.pop('f' + s, None) or kwds.pop('fix_' + s, None) + if val is not None: + key = 'f%d' % j + if key in kwds: + raise ValueError("Duplicate entry for %s." 
% key) + else: + kwds[key] = val + + args = list(args) + Nargs = len(args) + fixedn = [] + names = ['f%d' % n for n in range(Nargs - 2)] + ['floc', 'fscale'] + x0 = [] + for n, key in enumerate(names): + if key in kwds: + fixedn.append(n) + args[n] = kwds.pop(key) else: - return c - - -def entropy(pk, qk=None, base=None): - """Calculate the entropy of a distribution for given probability values. + x0.append(args[n]) + method = kwds.pop('method', 'ml').lower() + if method.startswith('mps'): + fitfun = self.nlogps + else: + fitfun = self._penalized_nnlf - If only probabilities `pk` are given, the entropy is calculated as - ``S = -sum(pk * log(pk), axis=0)``. + if len(fixedn) == 0: + func = fitfun + restore = None + else: + if len(fixedn) == Nargs: + raise ValueError( + "All parameters fixed. There is nothing to optimize.") - If `qk` is not None, then compute the Kullback-Leibler divergence - ``S = sum(pk * log(pk / qk), axis=0)``. + def restore(args, theta): + # Replace with theta for all numbers not in fixedn + # This allows the non-fixed values to vary, but + # we still call self.nnlf with all parameters. + i = 0 + for n in range(Nargs): + if n not in fixedn: + args[n] = theta[i] + i += 1 + return args - This routine will normalize `pk` and `qk` if they don't sum to 1. + def func(theta, x): + newtheta = restore(args[:], theta) + return fitfun(newtheta, x) - Parameters - ---------- - pk : sequence - Defines the (discrete) distribution. ``pk[i]`` is the (possibly - unnormalized) probability of event ``i``. - qk : sequence, optional - Sequence against which the relative entropy is computed. Should be in - the same format as `pk`. - base : float, optional - The logarithmic base to use, defaults to ``e`` (natural logarithm). + return x0, func, restore, args - Returns - ------- - S : float - The calculated entropy. +def fit(self, data, *args, **kwds): """ - pk = asarray(pk) - pk = 1.0*pk / sum(pk, axis=0) - if qk is None: - vec = xlogy(pk, pk) - else: - qk = asarray(qk) - if len(qk) != len(pk): - raise ValueError("qk and pk must have same length.") - qk = 1.0*qk / sum(qk, axis=0) - # If qk is zero anywhere, then unless pk is zero at those places - # too, the relative entropy is infinite. - mask = qk == 0.0 - qk[mask] = 1.0 # Avoid the divide-by-zero warning - quotient = pk / qk - vec = -xlogy(pk, quotient) - vec[mask & (pk != 0.0)] = -inf - vec[mask & (pk == 0.0)] = 0.0 - S = -sum(vec, axis=0) - if base is not None: - S /= log(base) - return S - - -# Must over-ride one of _pmf or _cdf or pass in -# x_k, p(x_k) lists in initialization - -class rv_discrete(rv_generic): - """ - A generic discrete random variable class meant for subclassing. + Return ML or MPS estimate for shape, location, and scale parameters from data. - `rv_discrete` is a base class to construct specific distribution classes - and instances from for discrete random variables. rv_discrete can be used - to construct an arbitrary distribution with defined by a list of support - points and the corresponding probabilities. + ML and MPS stands for Maximum Likelihood and Maximum Product Spacing, + respectively. Starting estimates for + the fit are given by input arguments; for any arguments not provided + with starting estimates, ``self._fitstart(data)`` is called to generate + such. + + One can hold some parameters fixed to specific values by passing in + keyword arguments ``f0``, ``f1``, ..., ``fn`` (for shape parameters) + and ``floc`` and ``fscale`` (for location and scale parameters, + respectively). 
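As a rough illustration of the spacing criterion that the newly added
``nlogps`` minimizes when ``method='mps'`` is selected: a minimal sketch,
assuming a known location-scale family; the helper name and the use of
``scipy.stats.norm`` are illustrative, not part of wafo.stats::

    import numpy as np
    from scipy import stats

    def mps_objective(theta, x, dist=stats.norm):
        # T(theta) = -sum(log D_i): the D_i are the spacings of the
        # fitted CDF at the sorted sample, padded with 0 and 1.
        loc, scale = theta
        prb = np.hstack((0.0, dist.cdf((np.sort(x) - loc) / scale), 1.0))
        return -np.sum(np.log(np.diff(prb)))

    x = stats.norm.rvs(loc=1.0, scale=2.0, size=50, random_state=0)
    print(mps_objective([1.0, 2.0], x))  # near the true parameters
    print(mps_objective([0.0, 1.0], x))  # worse fit => larger objective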
Parameters ---------- - a : float, optional - Lower bound of the support of the distribution, default: 0 - b : float, optional - Upper bound of the support of the distribution, default: plus infinity - moment_tol : float, optional - The tolerance for the generic calculation of moments - values : tuple of two array_like - (xk, pk) where xk are points (integers) with positive probability pk - with sum(pk) = 1 - inc : integer - increment for the support of the distribution, default: 1 - other values have not been tested - badvalue : object, optional - The value in (masked) arrays that indicates a value that should be - ignored. - name : str, optional - The name of the instance. This string is used to construct the default - example for distributions. - longname : str, optional - This string is used as part of the first line of the docstring returned - when a subclass has no docstring of its own. Note: `longname` exists - for backwards compatibility, do not use for new subclasses. - shapes : str, optional - The shape of the distribution. For example ``"m, n"`` for a - distribution that takes two integers as the first two arguments for all - its methods. - extradoc : str, optional - This string is used as the last part of the docstring returned when a - subclass has no docstring of its own. Note: `extradoc` exists for - backwards compatibility, do not use for new subclasses. + data : array_like + Data to use in calculating the MLEs. + args : floats, optional + Starting value(s) for any shape-characterizing arguments (those not + provided will be determined by a call to ``_fitstart(data)``). + No default value. + kwds : floats, optional + Starting values for the location and scale parameters; no default. + Special keyword arguments are recognized as holding certain + parameters fixed: + + - f0...fn : hold respective shape parameters fixed. + Alternatively, shape parameters to fix can be specified by name. + For example, if ``self.shapes == "a, b"``, ``fa``and ``fix_a`` + are equivalent to ``f0``, and ``fb`` and ``fix_b`` are + equivalent to ``f1``. + + - floc : hold location parameter fixed to specified value. + + - fscale : hold scale parameter fixed to specified value. + + - optimizer : The optimizer to use. The optimizer must take ``func``, + and starting position as the first two arguments, + plus ``args`` (for extra arguments to pass to the + function to be optimized) and ``disp=0`` to suppress + output as keyword arguments. - Methods + Returns ------- - ``generic.rvs(, loc=0, size=1)`` - random variates - - ``generic.pmf(x, , loc=0)`` - probability mass function - - ``logpmf(x, , loc=0)`` - log of the probability density function - - ``generic.cdf(x, , loc=0)`` - cumulative density function - - ``generic.logcdf(x, , loc=0)`` - log of the cumulative density function - - ``generic.sf(x, , loc=0)`` - survival function (1-cdf --- sometimes more accurate) - - ``generic.logsf(x, , loc=0, scale=1)`` - log of the survival function - - ``generic.ppf(q, , loc=0)`` - percent point function (inverse of cdf --- percentiles) - - ``generic.isf(q, , loc=0)`` - inverse survival function (inverse of sf) - - ``generic.moment(n, , loc=0)`` - non-central n-th moment of the distribution. May not work for array - arguments. 
- - ``generic.stats(, loc=0, moments='mv')`` - mean('m', axis=0), variance('v'), skew('s'), and/or kurtosis('k') - - ``generic.entropy(, loc=0)`` - entropy of the RV - - ``generic.expect(func=None, args=(), loc=0, lb=None, ub=None, conditional=False)`` - Expected value of a function with respect to the distribution. - Additional kwd arguments passed to integrate.quad - - ``generic.median(, loc=0)`` - Median of the distribution. - - ``generic.mean(, loc=0)`` - Mean of the distribution. - - ``generic.std(, loc=0)`` - Standard deviation of the distribution. - - ``generic.var(, loc=0)`` - Variance of the distribution. - - ``generic.interval(alpha, , loc=0)`` - Interval that with `alpha` percent probability contains a random - realization of this distribution. - - ``generic(, loc=0)`` - calling a distribution instance returns a frozen distribution + shape, loc, scale : tuple of floats + MLEs for any shape statistics, followed by those for location and + scale. Notes ----- + This fit is computed by maximizing a log-likelihood function, with + penalty applied for samples outside of range of the distribution. The + returned answer is not guaranteed to be the globally optimal MLE, it + may only be locally optimal, or the optimization may fail altogether. - You can construct an arbitrary discrete rv where ``P{X=xk} = pk`` - by passing to the rv_discrete initialization method (through the - values=keyword) a tuple of sequences (xk, pk) which describes only those - values of X (xk) that occur with nonzero probability (pk). - To create a new discrete distribution, we would do the following:: - - class poisson_gen(rv_discrete): - # "Poisson distribution" - def _pmf(self, k, mu): - ... - - and create an instance:: - - poisson = poisson_gen(name="poisson", - longname='A Poisson') + Examples + -------- - The docstring can be created from a template. + Generate some data to fit: draw random variates from the `beta` + distribution - Alternatively, the object may be called (as a function) to fix the shape - and location parameters returning a "frozen" discrete RV object:: + >>> from wafo.stats import beta + >>> a, b = 1., 2. + >>> x = beta.rvs(a, b, size=1000) - myrv = generic(, loc=0) - - frozen RV object with the same methods but holding the given - shape and location fixed. + Now we can fit all four parameters (``a``, ``b``, ``loc`` and ``scale``): - A note on ``shapes``: subclasses need not specify them explicitly. In this - case, the `shapes` will be automatically deduced from the signatures of the - overridden methods. - If, for some reason, you prefer to avoid relying on introspection, you can - specify ``shapes`` explicitly as an argument to the instance constructor. + >>> a1, b1, loc1, scale1 = beta.fit(x) + We can also use some prior knowledge about the dataset: let's keep + ``loc`` and ``scale`` fixed: - Examples - -------- + >>> a1, b1, loc1, scale1 = beta.fit(x, floc=0, fscale=1) + >>> loc1, scale1 + (0, 1) - Custom made discrete distribution: + We can also keep shape parameters fixed by using ``f``-keywords. 
To + keep the zero-th shape parameter ``a`` equal 1, use ``f0=1`` or, + equivalently, ``fa=1``: - >>> from scipy import stats - >>> xk = np.arange(7) - >>> pk = (0.1, 0.2, 0.3, 0.1, 0.1, 0.0, 0.2) - >>> custm = stats.rv_discrete(name='custm', values=(xk, pk)) - >>> - >>> import matplotlib.pyplot as plt - >>> fig, ax = plt.subplots(1, 1) - >>> ax.plot(xk, custm.pmf(xk), 'ro', ms=12, mec='r') - >>> ax.vlines(xk, 0, custm.pmf(xk), colors='r', lw=4) - >>> plt.show() + >>> a1, b1, loc1, scale1 = beta.fit(x, fa=1, floc=0, fscale=1) + >>> a1 + 1 - Random number generation: + """ + Narg = len(args) + if Narg > self.numargs: + raise TypeError("Too many input arguments.") + + start = [None]*2 + if (Narg < self.numargs) or not ('loc' in kwds and + 'scale' in kwds): + # get distribution specific starting locations + start = self._fitstart(data) + args += start[Narg:-2] + loc = kwds.pop('loc', start[-2]) + scale = kwds.pop('scale', start[-1]) + args += (loc, scale) + x0, func, restore, args = self._reduce_func(args, kwds) + + optimizer = kwds.pop('optimizer', optimize.fmin) + # convert string to function in scipy.optimize + if not callable(optimizer) and isinstance(optimizer, string_types): + if not optimizer.startswith('fmin_'): + optimizer = "fmin_"+optimizer + if optimizer == 'fmin_': + optimizer = 'fmin' + try: + optimizer = getattr(optimize, optimizer) + except AttributeError: + raise ValueError("%s is not a valid optimizer" % optimizer) - >>> R = custm.rvs(size=100) + # by now kwds must be empty, since everybody took what they needed + if kwds: + raise TypeError("Unknown arguments: %s." % kwds) - Check accuracy of cdf and ppf: + vals = optimizer(func, x0, args=(ravel(data),), disp=0) + if restore is not None: + vals = restore(args, vals) + vals = tuple(vals) + return vals - >>> prb = custm.cdf(x, ) - >>> h = plt.semilogy(np.abs(x-custm.ppf(prb, ))+1e-20) - """ - def __init__(self, a=0, b=inf, name=None, badvalue=None, - moment_tol=1e-8, values=None, inc=1, longname=None, - shapes=None, extradoc=None): - - super(rv_discrete, self).__init__() - - # cf generic freeze - self._ctor_param = dict( - a=a, b=b, name=name, badvalue=badvalue, - moment_tol=moment_tol, values=values, inc=inc, - longname=longname, shapes=shapes, extradoc=extradoc) - - if badvalue is None: - badvalue = nan - if name is None: - name = 'Distribution' - self.badvalue = badvalue - self.a = a - self.b = b - self.name = name - self.moment_tol = moment_tol - self.inc = inc - self._cdfvec = vectorize(self._cdf_single, otypes='d') - self.return_integers = 1 - self.vecentropy = vectorize(self._entropy) - self.shapes = shapes - self.extradoc = extradoc - - if values is not None: - self.xk, self.pk = values - self.return_integers = 0 - indx = argsort(ravel(self.xk)) - self.xk = take(ravel(self.xk), indx, 0) - self.pk = take(ravel(self.pk), indx, 0) - self.a = self.xk[0] - self.b = self.xk[-1] - self.P = dict(zip(self.xk, self.pk)) - self.qvals = np.cumsum(self.pk, axis=0) - self.F = dict(zip(self.xk, self.qvals)) - decreasing_keys = sorted(self.F.keys(), reverse=True) - self.Finv = dict((self.F[k], k) for k in decreasing_keys) - self._ppf = instancemethod(vectorize(_drv_ppf, otypes='d'), - self, rv_discrete) - self._pmf = instancemethod(vectorize(_drv_pmf, otypes='d'), - self, rv_discrete) - self._cdf = instancemethod(vectorize(_drv_cdf, otypes='d'), - self, rv_discrete) - self._nonzero = instancemethod(_drv_nonzero, self, rv_discrete) - self.generic_moment = instancemethod(_drv_moment, - self, rv_discrete) - self.moment_gen = 
instancemethod(_drv_moment_gen, - self, rv_discrete) - self._construct_argparser(meths_to_inspect=[_drv_pmf], - locscale_in='loc=0', - # scale=1 for discrete RVs - locscale_out='loc, 1') - else: - self._construct_argparser(meths_to_inspect=[self._pmf, self._cdf], - locscale_in='loc=0', - # scale=1 for discrete RVs - locscale_out='loc, 1') - - # nin correction needs to be after we know numargs - # correct nin for generic moment vectorization - _vec_generic_moment = vectorize(_drv2_moment, otypes='d') - _vec_generic_moment.nin = self.numargs + 2 - self.generic_moment = instancemethod(_vec_generic_moment, - self, rv_discrete) - # backwards compat. was removed in 0.14.0, put back but - # deprecated in 0.14.1: - self.vec_generic_moment = np.deprecate(_vec_generic_moment, - "vec_generic_moment", - "generic_moment") - - # correct nin for ppf vectorization - _vppf = vectorize(_drv2_ppfsingle, otypes='d') - _vppf.nin = self.numargs + 2 # +1 is for self - self._ppfvec = instancemethod(_vppf, - self, rv_discrete) - - # now that self.numargs is defined, we can adjust nin - self._cdfvec.nin = self.numargs + 1 - - # generate docstring for subclass instances - if longname is None: - if name[0] in ['aeiouAEIOU']: - hstr = "An " - else: - hstr = "A " - longname = hstr + name - - if sys.flags.optimize < 2: - # Skip adding docstrings if interpreter is run with -OO - if self.__doc__ is None: - self._construct_default_doc(longname=longname, - extradoc=extradoc) - else: - dct = dict(distdiscrete) - self._construct_doc(docdict_discrete, dct.get(self.name)) - - #discrete RV do not have the scale parameter, remove it - self.__doc__ = self.__doc__.replace( - '\n scale : array_like, ' - 'optional\n scale parameter (default=1)', '') - - def _construct_default_doc(self, longname=None, extradoc=None): - """Construct instance docstring from the rv_discrete template.""" - if extradoc is None: - extradoc = '' - if extradoc.startswith('\n\n'): - extradoc = extradoc[2:] - self.__doc__ = ''.join(['%s discrete random variable.' % longname, - '\n\n%(before_notes)s\n', docheaders['notes'], - extradoc, '\n%(example)s']) - self._construct_doc(docdict_discrete) - - def _nonzero(self, k, *args): - return floor(k) == k - - def _pmf(self, k, *args): - return self._cdf(k, *args) - self._cdf(k-1, *args) - - def _logpmf(self, k, *args): - return log(self._pmf(k, *args)) - - def _cdf_single(self, k, *args): - m = arange(int(self.a), k+1) - return sum(self._pmf(m, *args), axis=0) - - def _cdf(self, x, *args): - k = floor(x) - return self._cdfvec(k, *args) - - # generic _logcdf, _sf, _logsf, _ppf, _isf, _rvs defined in rv_generic - - def rvs(self, *args, **kwargs): - """ - Random variates of given type. - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - size : int or tuple of ints, optional - Defining number of random variates (default=1). Note that `size` - has to be given as keyword, not as positional argument. - - Returns - ------- - rvs : ndarray or scalar - Random variates of given `size`. - - """ - kwargs['discrete'] = True - return super(rv_discrete, self).rvs(*args, **kwargs) - - def pmf(self, k, *args, **kwds): - """ - Probability mass function at k of the given RV. - - Parameters - ---------- - k : array_like - quantiles - arg1, arg2, arg3,... 
: array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - Location parameter (default=0). - - Returns - ------- - pmf : array_like - Probability mass function evaluated at k - - """ - args, loc, _ = self._parse_args(*args, **kwds) - k, loc = map(asarray, (k, loc)) - args = tuple(map(asarray, args)) - k = asarray((k-loc)) - cond0 = self._argcheck(*args) - cond1 = (k >= self.a) & (k <= self.b) & self._nonzero(k, *args) - cond = cond0 & cond1 - output = zeros(shape(cond), 'd') - place(output, (1-cond0) + np.isnan(k), self.badvalue) - if any(cond): - goodargs = argsreduce(cond, *((k,)+args)) - place(output, cond, np.clip(self._pmf(*goodargs), 0, 1)) - if output.ndim == 0: - return output[()] - return output - - def logpmf(self, k, *args, **kwds): - """ - Log of the probability mass function at k of the given RV. - - Parameters - ---------- - k : array_like - Quantiles. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter. Default is 0. - - Returns - ------- - logpmf : array_like - Log of the probability mass function evaluated at k. - - """ - args, loc, _ = self._parse_args(*args, **kwds) - k, loc = map(asarray, (k, loc)) - args = tuple(map(asarray, args)) - k = asarray((k-loc)) - cond0 = self._argcheck(*args) - cond1 = (k >= self.a) & (k <= self.b) & self._nonzero(k, *args) - cond = cond0 & cond1 - output = empty(shape(cond), 'd') - output.fill(NINF) - place(output, (1-cond0) + np.isnan(k), self.badvalue) - if any(cond): - goodargs = argsreduce(cond, *((k,)+args)) - place(output, cond, self._logpmf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def cdf(self, k, *args, **kwds): - """ - Cumulative distribution function of the given RV. - - Parameters - ---------- - k : array_like, int - Quantiles. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - - Returns - ------- - cdf : ndarray - Cumulative distribution function evaluated at `k`. - - """ - args, loc, _ = self._parse_args(*args, **kwds) - k, loc = map(asarray, (k, loc)) - args = tuple(map(asarray, args)) - k = asarray((k-loc)) - cond0 = self._argcheck(*args) - cond1 = (k >= self.a) & (k < self.b) - cond2 = (k >= self.b) - cond = cond0 & cond1 - output = zeros(shape(cond), 'd') - place(output, (1-cond0) + np.isnan(k), self.badvalue) - place(output, cond2*(cond0 == cond0), 1.0) - - if any(cond): - goodargs = argsreduce(cond, *((k,)+args)) - place(output, cond, np.clip(self._cdf(*goodargs), 0, 1)) - if output.ndim == 0: - return output[()] - return output - - def logcdf(self, k, *args, **kwds): - """ - Log of the cumulative distribution function at k of the given RV - - Parameters - ---------- - k : array_like, int - Quantiles. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - - Returns - ------- - logcdf : array_like - Log of the cumulative distribution function evaluated at k. 
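For reference, the masking behaviour of the generic ``pmf`` above can be
seen with any stock scipy distribution (this relies only on standard
scipy.stats semantics)::

    from scipy import stats

    # Outside the support the pmf is 0; a shape parameter that fails
    # _argcheck (here p > 1) is reported as badvalue, nan by default.
    print(stats.binom.pmf([-1, 3, 11], n=10, p=0.3))  # [0., 0.2668..., 0.]
    print(stats.binom.pmf(3, n=10, p=1.5))            # nan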
- - """ - args, loc, _ = self._parse_args(*args, **kwds) - k, loc = map(asarray, (k, loc)) - args = tuple(map(asarray, args)) - k = asarray((k-loc)) - cond0 = self._argcheck(*args) - cond1 = (k >= self.a) & (k < self.b) - cond2 = (k >= self.b) - cond = cond0 & cond1 - output = empty(shape(cond), 'd') - output.fill(NINF) - place(output, (1-cond0) + np.isnan(k), self.badvalue) - place(output, cond2*(cond0 == cond0), 0.0) - - if any(cond): - goodargs = argsreduce(cond, *((k,)+args)) - place(output, cond, self._logcdf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def sf(self, k, *args, **kwds): - """ - Survival function (1-cdf) at k of the given RV. - - Parameters - ---------- - k : array_like - Quantiles. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - - Returns - ------- - sf : array_like - Survival function evaluated at k. - - """ - args, loc, _ = self._parse_args(*args, **kwds) - k, loc = map(asarray, (k, loc)) - args = tuple(map(asarray, args)) - k = asarray(k-loc) - cond0 = self._argcheck(*args) - cond1 = (k >= self.a) & (k <= self.b) - cond2 = (k < self.a) & cond0 - cond = cond0 & cond1 - output = zeros(shape(cond), 'd') - place(output, (1-cond0) + np.isnan(k), self.badvalue) - place(output, cond2, 1.0) - if any(cond): - goodargs = argsreduce(cond, *((k,)+args)) - place(output, cond, np.clip(self._sf(*goodargs), 0, 1)) - if output.ndim == 0: - return output[()] - return output - - def logsf(self, k, *args, **kwds): - """ - Log of the survival function of the given RV. - - Returns the log of the "survival function," defined as ``1 - cdf``, - evaluated at `k`. - - Parameters - ---------- - k : array_like - Quantiles. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - - Returns - ------- - logsf : ndarray - Log of the survival function evaluated at `k`. - - """ - args, loc, _ = self._parse_args(*args, **kwds) - k, loc = map(asarray, (k, loc)) - args = tuple(map(asarray, args)) - k = asarray(k-loc) - cond0 = self._argcheck(*args) - cond1 = (k >= self.a) & (k <= self.b) - cond2 = (k < self.a) & cond0 - cond = cond0 & cond1 - output = empty(shape(cond), 'd') - output.fill(NINF) - place(output, (1-cond0) + np.isnan(k), self.badvalue) - place(output, cond2, 0.0) - if any(cond): - goodargs = argsreduce(cond, *((k,)+args)) - place(output, cond, self._logsf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def ppf(self, q, *args, **kwds): - """ - Percent point function (inverse of cdf) at q of the given RV - - Parameters - ---------- - q : array_like - Lower tail probability. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - scale : array_like, optional - Scale parameter (default=1). - - Returns - ------- - k : array_like - Quantile corresponding to the lower tail probability, q. 
- - """ - args, loc, _ = self._parse_args(*args, **kwds) - q, loc = map(asarray, (q, loc)) - args = tuple(map(asarray, args)) - cond0 = self._argcheck(*args) & (loc == loc) - cond1 = (q > 0) & (q < 1) - cond2 = (q == 1) & cond0 - cond = cond0 & cond1 - output = valarray(shape(cond), value=self.badvalue, typecode='d') - # output type 'd' to handle nin and inf - place(output, (q == 0)*(cond == cond), self.a-1) - place(output, cond2, self.b) - if any(cond): - goodargs = argsreduce(cond, *((q,)+args+(loc,))) - loc, goodargs = goodargs[-1], goodargs[:-1] - place(output, cond, self._ppf(*goodargs) + loc) - - if output.ndim == 0: - return output[()] - return output - - def isf(self, q, *args, **kwds): - """ - Inverse survival function (inverse of `sf`) at q of the given RV. - - Parameters - ---------- - q : array_like - Upper tail probability. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - - Returns - ------- - k : ndarray or scalar - Quantile corresponding to the upper tail probability, q. - - """ - args, loc, _ = self._parse_args(*args, **kwds) - q, loc = map(asarray, (q, loc)) - args = tuple(map(asarray, args)) - cond0 = self._argcheck(*args) & (loc == loc) - cond1 = (q > 0) & (q < 1) - cond2 = (q == 1) & cond0 - cond = cond0 & cond1 - - # same problem as with ppf; copied from ppf and changed - output = valarray(shape(cond), value=self.badvalue, typecode='d') - # output type 'd' to handle nin and inf - place(output, (q == 0)*(cond == cond), self.b) - place(output, cond2, self.a-1) - - # call place only if at least 1 valid argument - if any(cond): - goodargs = argsreduce(cond, *((q,)+args+(loc,))) - loc, goodargs = goodargs[-1], goodargs[:-1] - # PB same as ticket 766 - place(output, cond, self._isf(*goodargs) + loc) - - if output.ndim == 0: - return output[()] - return output - - def _entropy(self, *args): - if hasattr(self, 'pk'): - return entropy(self.pk) - else: - mu = int(self.stats(*args, **{'moments': 'm'})) - val = self.pmf(mu, *args) - ent = -xlogy(val, val) - k = 1 - term = 1.0 - while (abs(term) > _EPS): - val = self.pmf(mu+k, *args) - term = -xlogy(val, val) - val = self.pmf(mu-k, *args) - term -= xlogy(val, val) - k += 1 - ent += term - return ent - - def expect(self, func=None, args=(), loc=0, lb=None, ub=None, - conditional=False): - """ - Calculate expected value of a function with respect to the distribution - for discrete distribution - - Parameters - ---------- - fn : function (default: identity mapping) - Function for which sum is calculated. Takes only one argument. - args : tuple - argument (parameters) of the distribution - lb, ub : numbers, optional - lower and upper bound for integration, default is set to the - support of the distribution, lb and ub are inclusive (ul<=k<=ub) - conditional : bool, optional - Default is False. - If true then the expectation is corrected by the conditional - probability of the integration interval. The return value is the - expectation of the function, conditional on being in the given - interval (k such that ul<=k<=ub). - - Returns - ------- - expect : float - Expected value. - - Notes - ----- - * function is not vectorized - * accuracy: uses self.moment_tol as stopping criterium - for heavy tailed distribution e.g. 
zipf(4), accuracy for - mean, variance in example is only 1e-5, - increasing precision (moment_tol) makes zipf very slow - * suppnmin=100 internal parameter for minimum number of points to - evaluate could be added as keyword parameter, to evaluate functions - with non-monotonic shapes, points include integers in (-suppnmin, - suppnmin) - * uses maxcount=1000 limits the number of points that are evaluated - to break loop for infinite sums - (a maximum of suppnmin+1000 positive plus suppnmin+1000 negative - integers are evaluated) - - """ - - # moment_tol = 1e-12 # increase compared to self.moment_tol, - # too slow for only small gain in precision for zipf - - # avoid endless loop with unbound integral, eg. var of zipf(2) - maxcount = 1000 - suppnmin = 100 # minimum number of points to evaluate (+ and -) - - if func is None: - def fun(x): - # loc and args from outer scope - return (x+loc)*self._pmf(x, *args) - else: - def fun(x): - # loc and args from outer scope - return func(x+loc)*self._pmf(x, *args) - # used pmf because _pmf does not check support in randint and there - # might be problems(?) with correct self.a, self.b at this stage maybe - # not anymore, seems to work now with _pmf - - self._argcheck(*args) # (re)generate scalar self.a and self.b - if lb is None: - lb = (self.a) - else: - lb = lb - loc # convert bound for standardized distribution - if ub is None: - ub = (self.b) - else: - ub = ub - loc # convert bound for standardized distribution - if conditional: - if np.isposinf(ub)[()]: - # work around bug: stats.poisson.sf(stats.poisson.b, 2) is nan - invfac = 1 - self.cdf(lb-1, *args) - else: - invfac = 1 - self.cdf(lb-1, *args) - self.sf(ub, *args) - else: - invfac = 1.0 - - #tot = 0.0 - low, upp = self._ppf(0.001, *args), self._ppf(0.999, *args) - low = max(min(-suppnmin, low), lb) - upp = min(max(suppnmin, upp), ub) - supp = np.arange(low, upp+1, self.inc) # check limits - tot = np.sum(fun(supp)) - diff = 1e100 - pos = upp + self.inc - count = 0 - - # handle cases with infinite support - - while (pos <= ub) and (diff > self.moment_tol) and count <= maxcount: - diff = fun(pos) - tot += diff - pos += self.inc - count += 1 - - if self.a < 0: # handle case when self.a = -inf - diff = 1e100 - pos = low - self.inc - while ((pos >= lb) and (diff > self.moment_tol) and - count <= maxcount): - diff = fun(pos) - tot += diff - pos -= self.inc - count += 1 - if count > maxcount: - warnings.warn('expect(): sum did not converge', RuntimeWarning) - return tot/invfac - - -def get_distribution_names(namespace_pairs, rv_base_class): - """ - Collect names of statistical distributions and their generators. +def fit2(self, data, *args, **kwds): + ''' Return Maximum Likelihood or Maximum Product Spacing estimator object Parameters ---------- - namespace_pairs : sequence - A snapshot of (name, value) pairs in the namespace of a module. - rv_base_class : class - The base class of random variable generator classes in a module. + data : array-like + Data to use in calculating the ML or MPS estimators + args : optional + Starting values for any shape arguments (those not specified + will be determined by dist._fitstart(data)) + kwds : loc, scale + Starting values for the location and scale parameters + Special keyword arguments are recognized as holding certain + parameters fixed: + f0..fn : hold respective shape paramters fixed + floc : hold location parameter fixed to specified value + fscale : hold scale parameter fixed to specified value + method : of estimation. 
Options are
+            'ml'  : Maximum Likelihood method (default)
+            'mps' : Maximum Product Spacing method
+        alpha : scalar, optional
+            Confidence coefficient (default=0.05)
+        search : bool
+            If true, search for the best estimator (default);
+            otherwise return an object with the initial distribution
+            parameters.
+        copydata : bool
+            If true, copy `data` (default).
+        optimizer : The optimizer to use. The optimizer must take func,
+                    and starting position as the first two arguments,
+                    plus args (for extra arguments to pass to the
+                    function to be optimized) and disp=0 to suppress
+                    output as keyword arguments.
+
+    Returns
+    -------
+    phat : FitDistribution object
+        Fitted distribution object with the following member variables:
+        LLmax  : log-likelihood function evaluated using par
+        LPSmax : log product spacing function evaluated using par
+        pvalue : p-value for the fit
+        par : distribution parameters (fixed and fitted)
+        par_cov : covariance of distribution parameters
+        par_fix : fixed distribution parameters
+        par_lower : lower (1-alpha)% confidence bound for the parameters
+        par_upper : upper (1-alpha)% confidence bound for the parameters
+
+    Notes
+    -----
+    `data` is sorted by this function, so if ``copydata`` is False the
+    data in your namespace will be sorted as well.
+    '''
+    return FitDistribution(self, data, *args, **kwds)

-    Returns
-    -------
-    distn_names : list of strings
-        Names of the statistical distributions.
-    distn_gen_names : list of strings
-        Names of the generators of the statistical distributions.
-        Note that these are not simply the names of the statistical
-        distributions, with a _gen suffix added.
-
-    """
-    distn_names = []
-    distn_gen_names = []
-    for name, value in namespace_pairs:
-        if name.startswith('_'):
-            continue
-        if name.endswith('_gen') and issubclass(value, rv_base_class):
-            distn_gen_names.append(name)
-        if isinstance(value, rv_base_class):
-            distn_names.append(name)
-    return distn_names, distn_gen_names

+rv_generic.freeze = freeze
+rv_discrete.freeze = freeze
+rv_continuous.freeze = freeze
+rv_continuous.link = link
+rv_continuous._link = _link
+rv_continuous.nlogps = nlogps
+rv_continuous._reduce_func = _reduce_func
+rv_continuous.fit = fit
+rv_continuous.fit2 = fit2
+
diff --git a/wafo/stats/_distr_params.py b/wafo/stats/_distr_params.py
index ee19707..ec65bdd 100644
--- a/wafo/stats/_distr_params.py
+++ b/wafo/stats/_distr_params.py
@@ -10,6 +10,7 @@ distcont = [
     ['betaprime', (5, 6)],
     ['bradford', (0.29891359763170633,)],
     ['burr', (10.5, 4.3)],
+    ['burr12', (10, 4)],
     ['cauchy', ()],
     ['chi', (78,)],
     ['chi2', (55,)],
@@ -18,6 +19,7 @@ distcont = [
    ['dweibull', (2.0685080649914673,)],
     ['erlang', (10,)],
     ['expon', ()],
+    ['exponnorm', (1.5,)],
     ['exponpow', (2.697119160358469,)],
     ['exponweib', (2.8923945291034436, 1.9505288745913174)],
     ['f', (29, 18)],
@@ -33,8 +35,11 @@ distcont = [
     ['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)],
     ['genextreme', (-0.1,)],
     ['gengamma', (4.4162385429431925, 3.1193091679242761)],
+    ['gengamma', (4.4162385429431925, -3.1193091679242761)],
     ['genhalflogistic', (0.77274727809929322,)],
     ['genlogistic', (0.41192440799679475,)],
+    ['gennorm', (1.2988442399460265,)],
+    ['halfgennorm', (0.6748054997000371,)],
     ['genpareto', (0.1,)],  # use case with finite moments
     ['gilbrat', ()],
     ['gompertz', (0.94743713075105251,)],
@@ -80,6 +85,7 @@ distcont = [
     ['reciprocal', (0.0062309367010521255, 1.0062309367010522)],
     ['rice', (0.7749725210111873,)],
     ['semicircular', ()],
+    ['skewnorm', (4.0,)],
     ['t', (2.7433514990818093,)],
     ['triang', (0.15785029824528218,)],
['truncexpon', (4.6907725456810478,)], @@ -113,4 +119,3 @@ distdiscrete = [ ['skellam', (15, 8)], ['zipf', (6.5,)] ] - diff --git a/wafo/stats/_multivariate.py b/wafo/stats/_multivariate.py deleted file mode 100644 index 5455a61..0000000 --- a/wafo/stats/_multivariate.py +++ /dev/null @@ -1,884 +0,0 @@ -# -# Author: Joris Vankerschaver 2013 -# -from __future__ import division, print_function, absolute_import - -import numpy as np -import scipy.linalg -from scipy.misc import doccer -from scipy.special import gammaln - - -__all__ = ['multivariate_normal', 'dirichlet'] - -_LOG_2PI = np.log(2 * np.pi) - - -def _process_parameters(dim, mean, cov): - """ - Infer dimensionality from mean or covariance matrix, ensure that - mean and covariance are full vector resp. matrix. - - """ - - # Try to infer dimensionality - if dim is None: - if mean is None: - if cov is None: - dim = 1 - else: - cov = np.asarray(cov, dtype=float) - if cov.ndim < 2: - dim = 1 - else: - dim = cov.shape[0] - else: - mean = np.asarray(mean, dtype=float) - dim = mean.size - else: - if not np.isscalar(dim): - raise ValueError("Dimension of random variable must be a scalar.") - - # Check input sizes and return full arrays for mean and cov if necessary - if mean is None: - mean = np.zeros(dim) - mean = np.asarray(mean, dtype=float) - - if cov is None: - cov = 1.0 - cov = np.asarray(cov, dtype=float) - - if dim == 1: - mean.shape = (1,) - cov.shape = (1, 1) - - if mean.ndim != 1 or mean.shape[0] != dim: - raise ValueError("Array 'mean' must be a vector of length %d." % dim) - if cov.ndim == 0: - cov = cov * np.eye(dim) - elif cov.ndim == 1: - cov = np.diag(cov) - elif cov.ndim == 2 and cov.shape != (dim, dim): - rows, cols = cov.shape - if rows != cols: - msg = ("Array 'cov' must be square if it is two dimensional," - " but cov.shape = %s." % str(cov.shape)) - else: - msg = ("Dimension mismatch: array 'cov' is of shape %s," - " but 'mean' is a vector of length %d.") - msg = msg % (str(cov.shape), len(mean)) - raise ValueError(msg) - elif cov.ndim > 2: - raise ValueError("Array 'cov' must be at most two-dimensional," - " but cov.ndim = %d" % cov.ndim) - - return dim, mean, cov - - -def _process_quantiles(x, dim): - """ - Adjust quantiles array so that last axis labels the components of - each data point. - - """ - x = np.asarray(x, dtype=float) - - if x.ndim == 0: - x = x[np.newaxis] - elif x.ndim == 1: - if dim == 1: - x = x[:, np.newaxis] - else: - x = x[np.newaxis, :] - - return x - - -def _squeeze_output(out): - """ - Remove single-dimensional entries from array and convert to scalar, - if necessary. - - """ - out = out.squeeze() - if out.ndim == 0: - out = out[()] - return out - - -def _eigvalsh_to_eps(spectrum, cond=None, rcond=None): - """ - Determine which eigenvalues are "small" given the spectrum. - - This is for compatibility across various linear algebra functions - that should agree about whether or not a Hermitian matrix is numerically - singular and what is its numerical matrix rank. - This is designed to be compatible with scipy.linalg.pinvh. - - Parameters - ---------- - spectrum : 1d ndarray - Array of eigenvalues of a Hermitian matrix. - cond, rcond : float, optional - Cutoff for small eigenvalues. - Singular values smaller than rcond * largest_eigenvalue are - considered zero. - If None or -1, suitable machine precision is used. - - Returns - ------- - eps : float - Magnitude cutoff for numerical negligibility. 
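The cutoff rule documented above can be restated as a tiny runnable
sketch (the function name is illustrative)::

    import numpy as np

    def eigvalsh_to_eps_sketch(spectrum, cond=None):
        # Eigenvalues below cond * max|eigenvalue| are treated as zero;
        # the default cond scales machine epsilon by dtype.
        if cond is None:
            t = spectrum.dtype.char.lower()
            cond = {'f': 1e3, 'd': 1e6}[t] * np.finfo(t).eps
        return cond * np.max(np.abs(spectrum))

    s = np.array([3.0, 1e-20, 2.0])
    eps = eigvalsh_to_eps_sketch(s)
    print(eps, (s > eps).sum())  # the tiny eigenvalue is numerically zero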
- - """ - if rcond is not None: - cond = rcond - if cond in [None, -1]: - t = spectrum.dtype.char.lower() - factor = {'f': 1E3, 'd': 1E6} - cond = factor[t] * np.finfo(t).eps - eps = cond * np.max(abs(spectrum)) - return eps - - -def _pinv_1d(v, eps=1e-5): - """ - A helper function for computing the pseudoinverse. - - Parameters - ---------- - v : iterable of numbers - This may be thought of as a vector of eigenvalues or singular values. - eps : float - Values with magnitude no greater than eps are considered negligible. - - Returns - ------- - v_pinv : 1d float ndarray - A vector of pseudo-inverted numbers. - - """ - return np.array([0 if abs(x) <= eps else 1/x for x in v], dtype=float) - - -class _PSD(object): - """ - Compute coordinated functions of a symmetric positive semidefinite matrix. - - This class addresses two issues. Firstly it allows the pseudoinverse, - the logarithm of the pseudo-determinant, and the rank of the matrix - to be computed using one call to eigh instead of three. - Secondly it allows these functions to be computed in a way - that gives mutually compatible results. - All of the functions are computed with a common understanding as to - which of the eigenvalues are to be considered negligibly small. - The functions are designed to coordinate with scipy.linalg.pinvh() - but not necessarily with np.linalg.det() or with np.linalg.matrix_rank(). - - Parameters - ---------- - M : 2d array-like - Symmetric positive semidefinite matrix. - cond, rcond : float, optional - Cutoff for small eigenvalues. - Singular values smaller than rcond * largest_eigenvalue are - considered zero. - If None or -1, suitable machine precision is used. - lower : bool, optional - Whether the pertinent array data is taken from the lower - or upper triangle of M. (Default: lower) - check_finite : bool, optional - Whether to check that the input matrices contain only finite - numbers. Disabling may give a performance gain, but may result - in problems (crashes, non-termination) if the inputs do contain - infinities or NaNs. - allow_singular : bool, optional - Whether to allow a singular matrix. (Default: True) - - Notes - ----- - The arguments are similar to those of scipy.linalg.pinvh(). - - """ - - def __init__(self, M, cond=None, rcond=None, lower=True, - check_finite=True, allow_singular=True): - # Compute the symmetric eigendecomposition. - # Note that eigh takes care of array conversion, chkfinite, - # and assertion that the matrix is square. - s, u = scipy.linalg.eigh(M, lower=lower, check_finite=check_finite) - - eps = _eigvalsh_to_eps(s, cond, rcond) - if np.min(s) < -eps: - raise ValueError('the input matrix must be positive semidefinite') - d = s[s > eps] - if len(d) < len(s) and not allow_singular: - raise np.linalg.LinAlgError('singular matrix') - s_pinv = _pinv_1d(s, eps) - U = np.multiply(u, np.sqrt(s_pinv)) - - # Initialize the eagerly precomputed attributes. - self.rank = len(d) - self.U = U - self.log_pdet = np.sum(np.log(d)) - - # Initialize an attribute to be lazily computed. - self._pinv = None - - @property - def pinv(self): - if self._pinv is None: - self._pinv = np.dot(self.U, self.U.T) - return self._pinv - - -_doc_default_callparams = """\ -mean : array_like, optional - Mean of the distribution (default zero) -cov : array_like, optional - Covariance matrix of the distribution (default one) -allow_singular : bool, optional - Whether to allow a singular covariance matrix. 
(Default: False) -""" - -_doc_callparams_note = \ - """Setting the parameter `mean` to `None` is equivalent to having `mean` - be the zero-vector. The parameter `cov` can be a scalar, in which case - the covariance matrix is the identity times that value, a vector of - diagonal entries for the covariance matrix, or a two-dimensional - array_like. - """ - -_doc_frozen_callparams = "" - -_doc_frozen_callparams_note = \ - """See class definition for a detailed description of parameters.""" - -docdict_params = { - '_doc_default_callparams': _doc_default_callparams, - '_doc_callparams_note': _doc_callparams_note -} - -docdict_noparams = { - '_doc_default_callparams': _doc_frozen_callparams, - '_doc_callparams_note': _doc_frozen_callparams_note -} - - -class multivariate_normal_gen(object): - r""" - A multivariate normal random variable. - - The `mean` keyword specifies the mean. The `cov` keyword specifies the - covariance matrix. - - Methods - ------- - pdf(x, mean=None, cov=1, allow_singular=False) - Probability density function. - logpdf(x, mean=None, cov=1, allow_singular=False) - Log of the probability density function. - rvs(mean=None, cov=1, allow_singular=False, size=1) - Draw random samples from a multivariate normal distribution. - entropy() - Compute the differential entropy of the multivariate normal. - - Parameters - ---------- - x : array_like - Quantiles, with the last axis of `x` denoting the components. - %(_doc_default_callparams)s - - Alternatively, the object may be called (as a function) to fix the mean - and covariance parameters, returning a "frozen" multivariate normal - random variable: - - rv = multivariate_normal(mean=None, cov=1, allow_singular=False) - - Frozen object with the same methods but holding the given - mean and covariance fixed. - - Notes - ----- - %(_doc_callparams_note)s - - The covariance matrix `cov` must be a (symmetric) positive - semi-definite matrix. The determinant and inverse of `cov` are computed - as the pseudo-determinant and pseudo-inverse, respectively, so - that `cov` does not need to have full rank. - - The probability density function for `multivariate_normal` is - - .. math:: - - f(x) = \frac{1}{\sqrt{(2 \pi)^k \det \Sigma}} \exp\left( -\frac{1}{2} (x - \mu)^T \Sigma^{-1} (x - \mu) \right), - - where :math:`\mu` is the mean, :math:`\Sigma` the covariance matrix, - and :math:`k` is the dimension of the space where :math:`x` takes values. - - .. versionadded:: 0.14.0 - - Examples - -------- - >>> import matplotlib.pyplot as plt - >>> from scipy.stats import multivariate_normal - >>> x = np.linspace(0, 5, 10, endpoint=False) - >>> y = multivariate_normal.pdf(x, mean=2.5, cov=0.5); y - array([ 0.00108914, 0.01033349, 0.05946514, 0.20755375, 0.43939129, - 0.56418958, 0.43939129, 0.20755375, 0.05946514, 0.01033349]) - >>> plt.plot(x, y) - - The input quantiles can be any shape of array, as long as the last - axis labels the components. This allows us for instance to - display the frozen pdf for a non-isotropic random variable in 2D as - follows: - - >>> x, y = np.mgrid[-1:1:.01, -1:1:.01] - >>> pos = np.empty(x.shape + (2,)) - >>> pos[:, :, 0] = x; pos[:, :, 1] = y - >>> rv = multivariate_normal([0.5, -0.2], [[2.0, 0.3], [0.3, 0.5]]) - >>> plt.contourf(x, y, rv.pdf(pos)) - - """ - - def __init__(self): - self.__doc__ = doccer.docformat(self.__doc__, docdict_params) - - def __call__(self, mean=None, cov=1, allow_singular=False): - """ - Create a frozen multivariate normal distribution. - - See `multivariate_normal_frozen` for more information. 
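A minimal check of the factorized logpdf described here, verified
against ``scipy.stats.multivariate_normal`` (the implementation this
package now defers to); the intermediate names are illustrative::

    import numpy as np
    from scipy import stats

    mean = np.array([0.5, -0.2])
    cov = np.array([[2.0, 0.3], [0.3, 0.5]])
    x = np.array([0.0, 0.0])

    s, u = np.linalg.eigh(cov)
    prec_U = u * np.sqrt(1.0 / s)   # dot(prec_U, prec_U.T) == inv(cov)
    maha = np.sum(np.square(np.dot(x - mean, prec_U)))
    logpdf = -0.5 * (2 * np.log(2 * np.pi) + np.sum(np.log(s)) + maha)
    print(np.isclose(logpdf,
                     stats.multivariate_normal.logpdf(x, mean, cov)))  # True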
- - """ - return multivariate_normal_frozen(mean, cov, - allow_singular=allow_singular) - - def _logpdf(self, x, mean, prec_U, log_det_cov, rank): - """ - Parameters - ---------- - x : ndarray - Points at which to evaluate the log of the probability - density function - mean : ndarray - Mean of the distribution - prec_U : ndarray - A decomposition such that np.dot(prec_U, prec_U.T) - is the precision matrix, i.e. inverse of the covariance matrix. - log_det_cov : float - Logarithm of the determinant of the covariance matrix - rank : int - Rank of the covariance matrix. - - Notes - ----- - As this function does no argument checking, it should not be - called directly; use 'logpdf' instead. - - """ - dev = x - mean - maha = np.sum(np.square(np.dot(dev, prec_U)), axis=-1) - return -0.5 * (rank * _LOG_2PI + log_det_cov + maha) - - def logpdf(self, x, mean, cov, allow_singular=False): - """ - Log of the multivariate normal probability density function. - - Parameters - ---------- - x : array_like - Quantiles, with the last axis of `x` denoting the components. - %(_doc_default_callparams)s - - Notes - ----- - %(_doc_callparams_note)s - - Returns - ------- - pdf : ndarray - Log of the probability density function evaluated at `x` - - """ - dim, mean, cov = _process_parameters(None, mean, cov) - x = _process_quantiles(x, dim) - psd = _PSD(cov, allow_singular=allow_singular) - out = self._logpdf(x, mean, psd.U, psd.log_pdet, psd.rank) - return _squeeze_output(out) - - def pdf(self, x, mean, cov, allow_singular=False): - """ - Multivariate normal probability density function. - - Parameters - ---------- - x : array_like - Quantiles, with the last axis of `x` denoting the components. - %(_doc_default_callparams)s - - Notes - ----- - %(_doc_callparams_note)s - - Returns - ------- - pdf : ndarray - Probability density function evaluated at `x` - - """ - dim, mean, cov = _process_parameters(None, mean, cov) - x = _process_quantiles(x, dim) - psd = _PSD(cov, allow_singular=allow_singular) - out = np.exp(self._logpdf(x, mean, psd.U, psd.log_pdet, psd.rank)) - return _squeeze_output(out) - - def rvs(self, mean=None, cov=1, size=1): - """ - Draw random samples from a multivariate normal distribution. - - Parameters - ---------- - %(_doc_default_callparams)s - size : integer, optional - Number of samples to draw (default 1). - - Notes - ----- - %(_doc_callparams_note)s - - Returns - ------- - rvs : ndarray or scalar - Random variates of size (`size`, `N`), where `N` is the - dimension of the random variable. - - """ - dim, mean, cov = _process_parameters(None, mean, cov) - out = np.random.multivariate_normal(mean, cov, size) - return _squeeze_output(out) - - def entropy(self, mean=None, cov=1): - """ - Compute the differential entropy of the multivariate normal. - - Parameters - ---------- - %(_doc_default_callparams)s - - Notes - ----- - %(_doc_callparams_note)s - - Returns - ------- - h : scalar - Entropy of the multivariate normal distribution - - """ - dim, mean, cov = _process_parameters(None, mean, cov) - return 0.5 * np.log(np.linalg.det(2 * np.pi * np.e * cov)) - - -multivariate_normal = multivariate_normal_gen() - - -class multivariate_normal_frozen(object): - def __init__(self, mean=None, cov=1, allow_singular=False): - """ - Create a frozen multivariate normal distribution. 
- - Parameters - ---------- - mean : array_like, optional - Mean of the distribution (default zero) - cov : array_like, optional - Covariance matrix of the distribution (default one) - allow_singular : bool, optional - If this flag is True then tolerate a singular - covariance matrix (default False). - - Examples - -------- - When called with the default parameters, this will create a 1D random - variable with mean 0 and covariance 1: - - >>> from scipy.stats import multivariate_normal - >>> r = multivariate_normal() - >>> r.mean - array([ 0.]) - >>> r.cov - array([[1.]]) - - """ - self.dim, self.mean, self.cov = _process_parameters(None, mean, cov) - self.cov_info = _PSD(self.cov, allow_singular=allow_singular) - self._mnorm = multivariate_normal_gen() - - def logpdf(self, x): - x = _process_quantiles(x, self.dim) - out = self._mnorm._logpdf(x, self.mean, self.cov_info.U, - self.cov_info.log_pdet, self.cov_info.rank) - return _squeeze_output(out) - - def pdf(self, x): - return np.exp(self.logpdf(x)) - - def rvs(self, size=1): - return self._mnorm.rvs(self.mean, self.cov, size) - - def entropy(self): - """ - Computes the differential entropy of the multivariate normal. - - Returns - ------- - h : scalar - Entropy of the multivariate normal distribution - - """ - log_pdet = self.cov_info.log_pdet - rank = self.cov_info.rank - return 0.5 * (rank * (_LOG_2PI + 1) + log_pdet) - - -# Set frozen generator docstrings from corresponding docstrings in -# multivariate_normal_gen and fill in default strings in class docstrings -for name in ['logpdf', 'pdf', 'rvs']: - method = multivariate_normal_gen.__dict__[name] - method_frozen = multivariate_normal_frozen.__dict__[name] - method_frozen.__doc__ = doccer.docformat(method.__doc__, docdict_noparams) - method.__doc__ = doccer.docformat(method.__doc__, docdict_params) - -_dirichlet_doc_default_callparams = """\ -alpha : array_like - The concentration parameters. The number of entries determines the - dimensionality of the distribution. -""" -_dirichlet_doc_frozen_callparams = "" - -_dirichlet_doc_frozen_callparams_note = \ - """See class definition for a detailed description of parameters.""" - -dirichlet_docdict_params = { - '_dirichlet_doc_default_callparams': _dirichlet_doc_default_callparams, -} - -dirichlet_docdict_noparams = { - '_dirichlet_doc_default_callparams': _dirichlet_doc_frozen_callparams, -} - - -def _dirichlet_check_parameters(alpha): - alpha = np.asarray(alpha) - if np.min(alpha) <= 0: - raise ValueError("All parameters must be greater than 0") - elif alpha.ndim != 1: - raise ValueError("Parameter vector 'a' must be one dimensional, " + - "but a.shape = %s." % str(alpha.shape)) - return alpha - - -def _dirichlet_check_input(alpha, x): - x = np.asarray(x) - - if x.shape[0] + 1 != alpha.shape[0] and x.shape[0] != alpha.shape[0]: - raise ValueError("Vector 'x' must have one entry less then the" + - " parameter vector 'a', but alpha.shape = " + - "%s and " % alpha.shape + - "x.shape = %s." 
% x.shape) - - if x.shape[0] != alpha.shape[0]: - xk = np.array([1 - np.sum(x, 0)]) - if xk.ndim == 1: - x = np.append(x, xk) - elif xk.ndim == 2: - x = np.vstack((x, xk)) - else: - raise ValueError("The input must be one dimensional or a two " - "dimensional matrix containing the entries.") - - if np.min(x) < 0: - raise ValueError("Each entry in 'x' must be greater or equal zero.") - - if np.max(x) > 1: - raise ValueError("Each entry in 'x' must be smaller or equal one.") - - if (np.abs(np.sum(x, 0) - 1.0) > 10e-10).any(): - raise ValueError("The input vector 'x' must lie within the normal " + - "simplex. but sum(x)=%f." % np.sum(x, 0)) - - return x - - -def _lnB(alpha): - r""" - Internal helper function to compute the log of the useful quotient - - .. math:: - B(\alpha) = \frac{\prod_{i=1}{K}\Gamma(\alpha_i)}{\Gamma\left(\sum_{i=1}^{K}\alpha_i\right)} - - Parameters - ---------- - %(_dirichlet_doc_default_callparams)s - - Returns - ------- - B : scalar - Helper quotient, internal use only - - """ - return np.sum(gammaln(alpha)) - gammaln(np.sum(alpha)) - - -class dirichlet_gen(object): - r""" - A Dirichlet random variable. - - The `alpha` keyword specifies the concentration parameters of the - distribution. - - .. versionadded:: 0.15.0 - - Methods - ------- - pdf(x, alpha) - Probability density function. - logpdf(x, alpha) - Log of the probability density function. - rvs(alpha, size=1) - Draw random samples from a Dirichlet distribution. - mean(alpha) - The mean of the Dirichlet distribution - var(alpha) - The variance of the Dirichlet distribution - entropy(alpha) - Compute the differential entropy of the multivariate normal. - - Parameters - ---------- - x : array_like - Quantiles, with the last axis of `x` denoting the components. - %(_dirichlet_doc_default_callparams)s - - Alternatively, the object may be called (as a function) to fix - concentration parameters, returning a "frozen" Dirichlet - random variable: - - rv = dirichlet(alpha) - - Frozen object with the same methods but holding the given - concentration parameters fixed. - - Notes - ----- - Each :math:`\alpha` entry must be positive. The distribution has only - support on the simplex defined by - - .. math:: - \sum_{i=1}^{K} x_i \le 1 - - - The probability density function for `dirichlet` is - - .. math:: - - f(x) = \frac{1}{\mathrm{B}(\boldsymbol\alpha)} \prod_{i=1}^K x_i^{\alpha_i - 1} - - where - - .. math:: - \mathrm{B}(\boldsymbol\alpha) = \frac{\prod_{i=1}^K \Gamma(\alpha_i)}{\Gamma\bigl(\sum_{i=1}^K \alpha_i\bigr)} - - and :math:`\boldsymbol\alpha=(\alpha_1,\ldots,\alpha_K)`, the - concentration parameters and :math:`K` is the dimension of the space - where :math:`x` takes values. - - """ - - def __init__(self): - self.__doc__ = doccer.docformat(self.__doc__, dirichlet_docdict_params) - - def __call__(self, alpha): - return dirichlet_frozen(alpha) - - def _logpdf(self, x, alpha): - """ - Parameters - ---------- - x : ndarray - Points at which to evaluate the log of the probability - density function - %(_dirichlet_doc_default_callparams)s - - Notes - ----- - As this function does no argument checking, it should not be - called directly; use 'logpdf' instead. - - """ - lnB = _lnB(alpha) - return - lnB + np.sum((np.log(x.T) * (alpha - 1)).T, 0) - - def logpdf(self, x, alpha): - """ - Log of the Dirichlet probability density function. - - Parameters - ---------- - x : array_like - Quantiles, with the last axis of `x` denoting the components. 
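The Dirichlet log-density can be checked directly against the formula
above, using scipy's own ``dirichlet`` for comparison::

    import numpy as np
    from scipy.special import gammaln
    from scipy import stats

    alpha = np.array([2.0, 3.0, 4.0])
    x = np.array([0.2, 0.3, 0.5])              # a point on the simplex
    lnB = np.sum(gammaln(alpha)) - gammaln(np.sum(alpha))
    logpdf = -lnB + np.sum((alpha - 1.0) * np.log(x))
    print(np.isclose(logpdf, stats.dirichlet.logpdf(x, alpha)))  # True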
- %(_dirichlet_doc_default_callparams)s - - Returns - ------- - pdf : ndarray - Log of the probability density function evaluated at `x` - """ - alpha = _dirichlet_check_parameters(alpha) - x = _dirichlet_check_input(alpha, x) - - out = self._logpdf(x, alpha) - return _squeeze_output(out) - - def pdf(self, x, alpha): - """ - The Dirichlet probability density function. - - Parameters - ---------- - x : array_like - Quantiles, with the last axis of `x` denoting the components. - %(_dirichlet_doc_default_callparams)s - - Returns - ------- - pdf : ndarray - The probability density function evaluated at `x` - """ - alpha = _dirichlet_check_parameters(alpha) - x = _dirichlet_check_input(alpha, x) - - out = np.exp(self._logpdf(x, alpha)) - return _squeeze_output(out) - - def mean(self, alpha): - """ - Compute the mean of the dirichlet distribution. - - Parameters - ---------- - %(_dirichlet_doc_default_callparams)s - - Returns - ------- - mu : scalar - Mean of the Dirichlet distribution - - """ - alpha = _dirichlet_check_parameters(alpha) - - out = alpha / (np.sum(alpha)) - return _squeeze_output(out) - - def var(self, alpha): - """ - Compute the variance of the dirichlet distribution. - - Parameters - ---------- - %(_dirichlet_doc_default_callparams)s - - Returns - ------- - v : scalar - Variance of the Dirichlet distribution - - """ - - alpha = _dirichlet_check_parameters(alpha) - - alpha0 = np.sum(alpha) - out = (alpha * (alpha0 - alpha)) / ((alpha0 * alpha0) * (alpha0 + 1)) - return out - - def entropy(self, alpha): - """ - Compute the differential entropy of the dirichlet distribution. - - Parameters - ---------- - %(_dirichlet_doc_default_callparams)s - - Returns - ------- - h : scalar - Entropy of the Dirichlet distribution - - """ - - alpha = _dirichlet_check_parameters(alpha) - - alpha0 = np.sum(alpha) - lnB = _lnB(alpha) - K = alpha.shape[0] - - out = lnB + (alpha0 - K) * scipy.special.psi(alpha0) - np.sum( - (alpha - 1) * scipy.special.psi(alpha)) - return _squeeze_output(out) - - def rvs(self, alpha, size=1): - """ - Draw random samples from a Dirichlet distribution. - - Parameters - ---------- - %(_dirichlet_doc_default_callparams)s - size : integer, optional - Number of samples to draw (default 1). - - - Returns - ------- - rvs : ndarray or scalar - Random variates of size (`size`, `N`), where `N` is the - dimension of the random variable. 
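``rvs`` here is a thin wrapper around ``np.random.dirichlet``; the
samples lie on the simplex, so each row sums to one::

    import numpy as np

    r = np.random.dirichlet([2.0, 3.0, 4.0], size=4)
    print(r.shape)                          # (4, 3)
    print(np.allclose(r.sum(axis=1), 1.0))  # True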
- - """ - alpha = _dirichlet_check_parameters(alpha) - return np.random.dirichlet(alpha, size=size) - - -dirichlet = dirichlet_gen() - - -class dirichlet_frozen(object): - def __init__(self, alpha): - self.alpha = _dirichlet_check_parameters(alpha) - self._dirichlet = dirichlet_gen() - - def logpdf(self, x): - return self._dirichlet.logpdf(x, self.alpha) - - def pdf(self, x): - return self._dirichlet.pdf(x, self.alpha) - - def mean(self): - return self._dirichlet.mean(self.alpha) - - def var(self): - return self._dirichlet.var(self.alpha) - - def entropy(self): - return self._dirichlet.entropy(self.alpha) - - def rvs(self, size=1): - return self._dirichlet.rvs(self.alpha, size) - - -# Set frozen generator docstrings from corresponding docstrings in -# multivariate_normal_gen and fill in default strings in class docstrings -for name in ['logpdf', 'pdf', 'rvs', 'mean', 'var', 'entropy']: - method = dirichlet_gen.__dict__[name] - method_frozen = dirichlet_frozen.__dict__[name] - method_frozen.__doc__ = doccer.docformat( - method.__doc__, dirichlet_docdict_noparams) - method.__doc__ = doccer.docformat(method.__doc__, dirichlet_docdict_params) diff --git a/wafo/stats/_tukeylambda_stats.py b/wafo/stats/_tukeylambda_stats.py deleted file mode 100644 index 2681814..0000000 --- a/wafo/stats/_tukeylambda_stats.py +++ /dev/null @@ -1,201 +0,0 @@ -from __future__ import division, print_function, absolute_import - -import numpy as np -from numpy import poly1d -from scipy.special import beta - - -# The following code was used to generate the Pade coefficients for the -# Tukey Lambda variance function. Version 0.17 of mpmath was used. -#--------------------------------------------------------------------------- -# import mpmath as mp -# -# mp.mp.dps = 60 -# -# one = mp.mpf(1) -# two = mp.mpf(2) -# -# def mpvar(lam): -# if lam == 0: -# v = mp.pi**2 / three -# else: -# v = (two / lam**2) * (one / (one + two*lam) - -# mp.beta(lam + one, lam + one)) -# return v -# -# t = mp.taylor(mpvar, 0, 8) -# p, q = mp.pade(t, 4, 4) -# print "p =", [mp.fp.mpf(c) for c in p] -# print "q =", [mp.fp.mpf(c) for c in q] -#--------------------------------------------------------------------------- - -# Pade coefficients for the Tukey Lambda variance function. -_tukeylambda_var_pc = [3.289868133696453, 0.7306125098871127, - -0.5370742306855439, 0.17292046290190008, - -0.02371146284628187] -_tukeylambda_var_qc = [1.0, 3.683605511659861, 4.184152498888124, - 1.7660926747377275, 0.2643989311168465] - -# numpy.poly1d instances for the numerator and denominator of the -# Pade approximation to the Tukey Lambda variance. -_tukeylambda_var_p = poly1d(_tukeylambda_var_pc[::-1]) -_tukeylambda_var_q = poly1d(_tukeylambda_var_qc[::-1]) - - -def tukeylambda_variance(lam): - """Variance of the Tukey Lambda distribution. - - Parameters - ---------- - lam : array_like - The lambda values at which to compute the variance. - - Returns - ------- - v : ndarray - The variance. For lam < -0.5, the variance is not defined, so - np.nan is returned. For lam = 0.5, np.inf is returned. - - Notes - ----- - In an interval around lambda=0, this function uses the [4,4] Pade - approximation to compute the variance. Otherwise it uses the standard - formula (http://en.wikipedia.org/wiki/Tukey_lambda_distribution). The - Pade approximation is used because the standard formula has a removable - discontinuity at lambda = 0, and does not produce accurate numerical - results near lambda = 0. 
-    """
-    lam = np.asarray(lam)
-    shp = lam.shape
-    lam = np.atleast_1d(lam).astype(np.float64)
-
-    # For absolute values of lam less than threshold, use the Pade
-    # approximation.
-    threshold = 0.075
-
-    # Play games with masks to implement the conditional evaluation of
-    # the distribution.
-    # lambda < -0.5:  var = nan
-    low_mask = lam < -0.5
-    # lambda == -0.5: var = inf
-    neghalf_mask = lam == -0.5
-    # abs(lambda) < threshold:  use Pade approximation
-    small_mask = np.abs(lam) < threshold
-    # else the "regular" case:  use the explicit formula.
-    reg_mask = ~(low_mask | neghalf_mask | small_mask)
-
-    # Get the 'lam' values for the cases where they are needed.
-    small = lam[small_mask]
-    reg = lam[reg_mask]
-
-    # Compute the function for each case.
-    v = np.empty_like(lam)
-    v[low_mask] = np.nan
-    v[neghalf_mask] = np.inf
-    if small.size > 0:
-        # Use the Pade approximation near lambda = 0.
-        v[small_mask] = _tukeylambda_var_p(small) / _tukeylambda_var_q(small)
-    if reg.size > 0:
-        v[reg_mask] = (2.0 / reg**2) * (1.0 / (1.0 + 2 * reg) -
-                                        beta(reg + 1, reg + 1))
-    v.shape = shp
-    return v
-
-
-# The following code was used to generate the Pade coefficients for the
-# Tukey Lambda kurtosis function. Version 0.17 of mpmath was used.
-#---------------------------------------------------------------------------
-# import mpmath as mp
-#
-# mp.mp.dps = 60
-#
-# one = mp.mpf(1)
-# two = mp.mpf(2)
-# three = mp.mpf(3)
-# four = mp.mpf(4)
-#
-# def mpkurt(lam):
-#     if lam == 0:
-#         k = mp.mpf(6)/5
-#     else:
-#         numer = (one/(four*lam+one) - four*mp.beta(three*lam+one, lam+one) +
-#                  three*mp.beta(two*lam+one, two*lam+one))
-#         denom = two*(one/(two*lam+one) - mp.beta(lam+one,lam+one))**2
-#         k = numer / denom - three
-#     return k
-#
-# # There is a bug in mpmath 0.17: when we use the 'method' keyword of the
-# # taylor function and we request a degree 9 Taylor polynomial, we actually
-# # get degree 8.
-# t = mp.taylor(mpkurt, 0, 9, method='quad', radius=0.01)
-# t = [mp.chop(c, tol=1e-15) for c in t]
-# p, q = mp.pade(t, 4, 4)
-# print "p =", [mp.fp.mpf(c) for c in p]
-# print "q =", [mp.fp.mpf(c) for c in q]
-#---------------------------------------------------------------------------
-
-# Pade coefficients for the Tukey Lambda kurtosis function.
-_tukeylambda_kurt_pc = [1.2, -5.853465139719495, -22.653447381131077,
-                        0.20601184383406815, 4.59796302262789]
-_tukeylambda_kurt_qc = [1.0, 7.171149192233599, 12.96663094361842,
-                        0.43075235247853005, -2.789746758009912]
-
-# numpy.poly1d instances for the numerator and denominator of the
-# Pade approximation to the Tukey Lambda kurtosis.
-_tukeylambda_kurt_p = poly1d(_tukeylambda_kurt_pc[::-1])
-_tukeylambda_kurt_q = poly1d(_tukeylambda_kurt_qc[::-1])
-
-
-def tukeylambda_kurtosis(lam):
-    """Kurtosis of the Tukey Lambda distribution.
-
-    Parameters
-    ----------
-    lam : array_like
-        The lambda values at which to compute the kurtosis.
-
-    Returns
-    -------
-    k : ndarray
-        The kurtosis. For lam < -0.25, the kurtosis is not defined, so
-        np.nan is returned. For lam = -0.25, np.inf is returned.
-
-    """
-    lam = np.asarray(lam)
-    shp = lam.shape
-    lam = np.atleast_1d(lam).astype(np.float64)
-
-    # For absolute values of lam less than threshold, use the Pade
-    # approximation.
-    threshold = 0.055
-
-    # Use masks to implement the conditional evaluation of the kurtosis.
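The masking pattern continues below for the kurtosis; before that, the two variance branches above can be cross-checked at a point inside the ``|lam| < 0.075`` Pade window, using the coefficients defined earlier (a sketch; numpy and scipy only)::

    import numpy as np
    from numpy import poly1d
    from scipy.special import beta

    # _tukeylambda_var_pc / _tukeylambda_var_qc, reversed for poly1d
    p = poly1d([-0.02371146284628187, 0.17292046290190008,
                -0.5370742306855439, 0.7306125098871127, 3.289868133696453])
    q = poly1d([0.2643989311168465, 1.7660926747377275,
                4.184152498888124, 3.683605511659861, 1.0])

    lam = 0.05   # inside the Pade window, away from the lam = 0 singularity
    pade = p(lam) / q(lam)
    exact = (2.0 / lam**2) * (1.0 / (1.0 + 2 * lam) - beta(lam + 1, lam + 1))
    assert np.isclose(pade, exact)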
- # lambda < -0.25: kurtosis = nan - low_mask = lam < -0.25 - # lambda == -0.25: kurtosis = inf - negqrtr_mask = lam == -0.25 - # lambda near 0: use Pade approximation - small_mask = np.abs(lam) < threshold - # else the "regular" case: use the explicit formula. - reg_mask = ~(low_mask | negqrtr_mask | small_mask) - - # Get the 'lam' values for the cases where they are needed. - small = lam[small_mask] - reg = lam[reg_mask] - - # Compute the function for each case. - k = np.empty_like(lam) - k[low_mask] = np.nan - k[negqrtr_mask] = np.inf - if small.size > 0: - k[small_mask] = _tukeylambda_kurt_p(small) / _tukeylambda_kurt_q(small) - if reg.size > 0: - numer = (1.0 / (4 * reg + 1) - 4 * beta(3 * reg + 1, reg + 1) + - 3 * beta(2 * reg + 1, 2 * reg + 1)) - denom = 2 * (1.0/(2 * reg + 1) - beta(reg + 1, reg + 1))**2 - k[reg_mask] = numer / denom - 3 - - # The return value will be a numpy array; resetting the shape ensures that - # if `lam` was a scalar, the return value is a 0-d array. - k.shape = shp - return k diff --git a/wafo/stats/contingency.py b/wafo/stats/contingency.py deleted file mode 100644 index c67306c..0000000 --- a/wafo/stats/contingency.py +++ /dev/null @@ -1,271 +0,0 @@ -"""Some functions for working with contingency tables (i.e. cross tabulations). -""" - - -from __future__ import division, print_function, absolute_import - -from functools import reduce -import numpy as np -from .stats import power_divergence - - -__all__ = ['margins', 'expected_freq', 'chi2_contingency'] - - -def margins(a): - """Return a list of the marginal sums of the array `a`. - - Parameters - ---------- - a : ndarray - The array for which to compute the marginal sums. - - Returns - ------- - margsums : list of ndarrays - A list of length `a.ndim`. `margsums[k]` is the result - of summing `a` over all axes except `k`; it has the same - number of dimensions as `a`, but the length of each axis - except axis `k` will be 1. - - Examples - -------- - >>> a = np.arange(12).reshape(2, 6) - >>> a - array([[ 0, 1, 2, 3, 4, 5], - [ 6, 7, 8, 9, 10, 11]]) - >>> m0, m1 = margins(a) - >>> m0 - array([[15], - [51]]) - >>> m1 - array([[ 6, 8, 10, 12, 14, 16]]) - - >>> b = np.arange(24).reshape(2,3,4) - >>> m0, m1, m2 = margins(b) - >>> m0 - array([[[ 66]], - [[210]]]) - >>> m1 - array([[[ 60], - [ 92], - [124]]]) - >>> m2 - array([[[60, 66, 72, 78]]]) - """ - margsums = [] - ranged = list(range(a.ndim)) - for k in ranged: - marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k]) - margsums.append(marg) - return margsums - - -def expected_freq(observed): - """ - Compute the expected frequencies from a contingency table. - - Given an n-dimensional contingency table of observed frequencies, - compute the expected frequencies for the table based on the marginal - sums under the assumption that the groups associated with each - dimension are independent. - - Parameters - ---------- - observed : array_like - The table of observed frequencies. (While this function can handle - a 1-D array, that case is trivial. Generally `observed` is at - least 2-D.) - - Returns - ------- - expected : ndarray of float64 - The expected frequencies, based on the marginal sums of the table. - Same shape as `observed`. - - Examples - -------- - >>> observed = np.array([[10, 10, 20],[20, 20, 20]]) - >>> expected_freq(observed) - array([[ 12., 12., 16.], - [ 18., 18., 24.]]) - - """ - # Typically `observed` is an integer array. 
If `observed` has a large - # number of dimensions or holds large values, some of the following - # computations may overflow, so we first switch to floating point. - observed = np.asarray(observed, dtype=np.float64) - - # Create a list of the marginal sums. - margsums = margins(observed) - - # Create the array of expected frequencies. The shapes of the - # marginal sums returned by apply_over_axes() are just what we - # need for broadcasting in the following product. - d = observed.ndim - expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1) - return expected - - -def chi2_contingency(observed, correction=True, lambda_=None): - """Chi-square test of independence of variables in a contingency table. - - This function computes the chi-square statistic and p-value for the - hypothesis test of independence of the observed frequencies in the - contingency table [1]_ `observed`. The expected frequencies are computed - based on the marginal sums under the assumption of independence; see - `scipy.stats.contingency.expected_freq`. The number of degrees of - freedom is (expressed using numpy functions and attributes):: - - dof = observed.size - sum(observed.shape) + observed.ndim - 1 - - - Parameters - ---------- - observed : array_like - The contingency table. The table contains the observed frequencies - (i.e. number of occurrences) in each category. In the two-dimensional - case, the table is often described as an "R x C table". - correction : bool, optional - If True, *and* the degrees of freedom is 1, apply Yates' correction - for continuity. The effect of the correction is to adjust each - observed value by 0.5 towards the corresponding expected value. - lambda_ : float or str, optional. - By default, the statistic computed in this test is Pearson's - chi-squared statistic [2]_. `lambda_` allows a statistic from the - Cressie-Read power divergence family [3]_ to be used instead. See - `power_divergence` for details. - - Returns - ------- - chi2 : float - The test statistic. - p : float - The p-value of the test - dof : int - Degrees of freedom - expected : ndarray, same shape as `observed` - The expected frequencies, based on the marginal sums of the table. - - See Also - -------- - contingency.expected_freq - fisher_exact - chisquare - power_divergence - - Notes - ----- - An often quoted guideline for the validity of this calculation is that - the test should be used only if the observed and expected frequency in - each cell is at least 5. - - This is a test for the independence of different categories of a - population. The test is only meaningful when the dimension of - `observed` is two or more. Applying the test to a one-dimensional - table will always result in `expected` equal to `observed` and a - chi-square statistic equal to 0. - - This function does not handle masked arrays, because the calculation - does not make sense with missing values. - - Like stats.chisquare, this function computes a chi-square statistic; - the convenience this function provides is to figure out the expected - frequencies and degrees of freedom from the given contingency table. - If these were already known, and if the Yates' correction was not - required, one could use stats.chisquare. That is, if one calls:: - - chi2, p, dof, ex = chi2_contingency(obs, correction=False) - - then the following is true:: - - (chi2, p) == stats.chisquare(obs.ravel(), f_exp=ex.ravel(), - ddof=obs.size - 1 - dof) - - The `lambda_` argument was added in version 0.13.0 of scipy. - - References - ---------- - .. 
[1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table - .. [2] "Pearson's chi-squared test", - http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test - .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit - Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984), - pp. 440-464. - - Examples - -------- - A two-way example (2 x 3): - - >>> obs = np.array([[10, 10, 20], [20, 20, 20]]) - >>> chi2_contingency(obs) - (2.7777777777777777, - 0.24935220877729619, - 2, - array([[ 12., 12., 16.], - [ 18., 18., 24.]])) - - Perform the test using the log-likelihood ratio (i.e. the "G-test") - instead of Pearson's chi-squared statistic. - - >>> g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood") - >>> g, p - (2.7688587616781319, 0.25046668010954165) - - A four-way example (2 x 2 x 2 x 2): - - >>> obs = np.array( - ... [[[[12, 17], - ... [11, 16]], - ... [[11, 12], - ... [15, 16]]], - ... [[[23, 15], - ... [30, 22]], - ... [[14, 17], - ... [15, 16]]]]) - >>> chi2_contingency(obs) - (8.7584514426741897, - 0.64417725029295503, - 11, - array([[[[ 14.15462386, 14.15462386], - [ 16.49423111, 16.49423111]], - [[ 11.2461395 , 11.2461395 ], - [ 13.10500554, 13.10500554]]], - [[[ 19.5591166 , 19.5591166 ], - [ 22.79202844, 22.79202844]], - [[ 15.54012004, 15.54012004], - [ 18.10873492, 18.10873492]]]])) - """ - observed = np.asarray(observed) - if np.any(observed < 0): - raise ValueError("All values in `observed` must be nonnegative.") - if observed.size == 0: - raise ValueError("No data; `observed` has size 0.") - - expected = expected_freq(observed) - if np.any(expected == 0): - # Include one of the positions where expected is zero in - # the exception message. - zeropos = list(zip(*np.where(expected == 0)))[0] - raise ValueError("The internally computed table of expected " - "frequencies has a zero element at %s." % (zeropos,)) - - # The degrees of freedom - dof = expected.size - sum(expected.shape) + expected.ndim - 1 - - if dof == 0: - # Degenerate case; this occurs when `observed` is 1D (or, more - # generally, when it has only one nontrivial dimension). In this - # case, we also have observed == expected, so chi2 is 0. - chi2 = 0.0 - p = 1.0 - else: - if dof == 1 and correction: - # Adjust `observed` according to Yates' correction for continuity. - observed = observed + 0.5 * np.sign(expected - observed) - - chi2, p = power_divergence(observed, expected, - ddof=observed.size - 1 - dof, axis=None, - lambda_=lambda_) - - return chi2, p, dof, expected diff --git a/wafo/stats/distributions.py b/wafo/stats/distributions.py index ecf8942..c26fd65 100644 --- a/wafo/stats/distributions.py +++ b/wafo/stats/distributions.py @@ -22,4 +22,3 @@ __all__ = ['entropy', 'rv_discrete', 'rv_continuous'] # Add only the distribution names, not the *_gen names. 
__all__ += _continuous_distns._distn_names __all__ += _discrete_distns._distn_names - diff --git a/wafo/stats/estimation.py b/wafo/stats/estimation.py index 6ee4f64..3fd5371 100644 --- a/wafo/stats/estimation.py +++ b/wafo/stats/estimation.py @@ -11,20 +11,19 @@ from __future__ import division, absolute_import import warnings from wafo.plotbackend import plotbackend -from wafo.misc import ecross, findcross - - -import numdifftools # @UnresolvedImport +from wafo.misc import ecross, findcross, argsreduce +from wafo.stats._util import check_random_state +from wafo.stats._constants import _EPS, _XMAX +from wafo.stats._distn_infrastructure import rv_frozen +from scipy._lib.six import string_types +import numdifftools as nd # @UnresolvedImport from scipy import special from scipy.linalg import pinv2 from scipy import optimize -import numpy import numpy as np -from numpy import alltrue, arange, ravel, sum, zeros, log, sqrt, exp -from numpy import ( - atleast_1d, any, asarray, nan, pi, # reshape, #repeat, product, ndarray, - isfinite) +from numpy import (alltrue, arange, ravel, zeros, log, sqrt, exp, + atleast_1d, any, asarray, nan, pi, isfinite) from numpy import flatnonzero as nonzero @@ -48,97 +47,6 @@ def norm_ppf(q): return special.ndtri(q) -# Frozen RV class -class rv_frozen(object): - - ''' Frozen continous or discrete 1D Random Variable object (RV) - - Methods - ------- - rvs(size=1) - Random variates. - pdf(x) - Probability density function. - cdf(x) - Cumulative density function. - sf(x) - Survival function (1-cdf --- sometimes more accurate). - ppf(q) - Percent point function (inverse of cdf --- percentiles). - isf(q) - Inverse survival function (inverse of sf). - stats(moments='mv') - Mean('m'), variance('v'), skew('s'), and/or kurtosis('k'). - entropy() - (Differential) entropy of the RV. 
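The hand-rolled frozen-RV class removed here duplicated the `rv_frozen` now imported from `wafo.stats._distn_infrastructure` (see the import block above), so usage does not change: freezing fixes shape, location and scale once and exposes the same methods. A minimal sketch with scipy's equivalent machinery::

    from scipy.stats import gamma

    rv = gamma(2.5, loc=0.0, scale=2.0)   # shape, loc and scale frozen once
    rv.pdf(1.0)                           # no need to repeat the parameters
    rv.cdf(1.0)
    rv.stats(moments='mv')                # mean and variance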
- ''' - - def __init__(self, dist, *args, **kwds): - self.dist = dist - args, loc, scale = dist._parse_args(*args, **kwds) - if len(args) == dist.numargs - 2: # - # if isinstance(dist, rv_continuous): - self.par = args + (loc, scale) - else: # rv_discrete - self.par = args + (loc,) - - def pdf(self, x): - ''' Probability density function at x of the given RV.''' - return self.dist.pdf(x, *self.par) - - def cdf(self, x): - '''Cumulative distribution function at x of the given RV.''' - return self.dist.cdf(x, *self.par) - - def ppf(self, q): - '''Percent point function (inverse of cdf) at q of the given RV.''' - return self.dist.ppf(q, *self.par) - - def isf(self, q): - '''Inverse survival function at q of the given RV.''' - return self.dist.isf(q, *self.par) - - def rvs(self, size=None): - '''Random variates of given type.''' - kwds = dict(size=size) - return self.dist.rvs(*self.par, **kwds) - - def sf(self, x): - '''Survival function (1-cdf) at x of the given RV.''' - return self.dist.sf(x, *self.par) - - def stats(self, moments='mv'): - ''' Some statistics of the given RV''' - kwds = dict(moments=moments) - return self.dist.stats(*self.par, **kwds) - - def median(self): - return self.dist.median(*self.par) - - def mean(self): - return self.dist.mean(*self.par) - - def var(self): - return self.dist.var(*self.par) - - def std(self): - return self.dist.std(*self.par) - - def moment(self, n): - par1 = self.par[:self.dist.numargs] - return self.dist.moment(n, *par1) - - def entropy(self): - return self.dist.entropy(*self.par) - - def pmf(self, k): - '''Probability mass function at k of the given RV''' - return self.dist.pmf(k, *self.par) - - def interval(self, alpha): - return self.dist.interval(alpha, *self.par) - - # internal class to profile parameters of a given distribution class Profile(object): @@ -230,7 +138,7 @@ class Profile(object): def __init__(self, fit_dist, **kwds): try: - i0 = (1 - numpy.isfinite(fit_dist.par_fix)).argmax() + i0 = (1 - np.isfinite(fit_dist.par_fix)).argmax() except: i0 = 0 self.fit_dist = fit_dist @@ -259,7 +167,7 @@ class Profile(object): if fit_dist.par_fix is None: isnotfixed = np.ones(fit_dist.par.shape, dtype=bool) else: - isnotfixed = 1 - numpy.isfinite(fit_dist.par_fix) + isnotfixed = 1 - np.isfinite(fit_dist.par_fix) self.i_notfixed = nonzero(isnotfixed) @@ -341,7 +249,7 @@ class Profile(object): def _set_profile(self, phatfree0, p_opt): pvec = self._get_pvec(phatfree0, p_opt) - self.data = numpy.ones_like(pvec) * nan + self.data = np.ones_like(pvec) * nan k1 = (pvec >= p_opt).argmax() for size, step in ((-1, -1), (pvec.size, 1)): @@ -358,14 +266,14 @@ class Profile(object): def _prettify_profile(self): pvec = self.args - ix = nonzero(numpy.isfinite(pvec)) + ix = nonzero(np.isfinite(pvec)) self.data = self.data[ix] self.args = pvec[ix] - cond = self.data == -numpy.inf + cond = self.data == -np.inf if any(cond): ind, = cond.nonzero() self.data.put(ind, floatinfo.min / 2.0) - ind1 = numpy.where(ind == 0, ind, ind - 1) + ind1 = np.where(ind == 0, ind, ind - 1) cl = self.alpha_cross_level - self.alpha_Lrange / 2.0 t0 = ecross(self.args, self.data, ind1, cl) self.data.put(ind, cl) @@ -379,29 +287,29 @@ class Profile(object): phatv = self._par if self.profile_x: - gradfun = numdifftools.Gradient(self._myinvfun) + gradfun = nd.Gradient(self._myinvfun) else: - gradfun = numdifftools.Gradient(self._myprbfun) + gradfun = nd.Gradient(self._myprbfun) drl = gradfun(phatv[self.i_notfixed]) pcov = self.fit_dist.par_cov[i_notfixed, :][:, i_notfixed] - pvar = 
sum(numpy.dot(drl, pcov) * drl) + pvar = np.sum(np.dot(drl, pcov) * drl) return pvar def _get_pvec(self, phatfree0, p_opt): ''' return proper interval for the variable to profile ''' - linspace = numpy.linspace + linspace = np.linspace if self.pmin is None or self.pmax is None: pvar = self._get_variance() - if pvar <= 1e-5 or numpy.isnan(pvar): + if pvar <= 1e-5 or np.isnan(pvar): pvar = max(abs(p_opt) * 0.5, 0.5) p_crit = (-norm_ppf(self.alpha / 2.0) * - sqrt(numpy.ravel(pvar)) * 1.5) + sqrt(np.ravel(pvar)) * 1.5) if self.pmin is None: self.pmin = self._search_pmin(phatfree0, p_opt - 5.0 * p_crit, p_opt) @@ -412,13 +320,13 @@ class Profile(object): p_opt + 5.0 * p_crit, p_opt) p_crit_up = (self.pmax - p_opt) / 5 - N4 = numpy.floor(self.N / 4.0) + N4 = np.floor(self.N / 4.0) pvec1 = linspace(self.pmin, p_opt - p_crit_low, N4 + 1) pvec2 = linspace( p_opt - p_crit_low, p_opt + p_crit_up, self.N - 2 * N4) pvec3 = linspace(p_opt + p_crit_up, self.pmax, N4 + 1) - pvec = numpy.unique(numpy.hstack((pvec1, p_opt, pvec2, pvec3))) + pvec = np.unique(np.hstack((pvec1, p_opt, pvec2, pvec3))) else: pvec = linspace(self.pmin, self.pmax, self.N) @@ -701,12 +609,12 @@ class FitDistribution(rv_frozen): m_variables = ['method', 'alpha', 'par_fix', 'search', 'copydata'] m_defaults = ['ml', 0.05, None, True, True] for (name, val) in zip(m_variables, m_defaults): - setattr(self, name, kwds.get(name, val)) + setattr(self, name, kwds.pop(name, val)) if self.method.lower()[:].startswith('mps'): - self._fitfun = dist.nlogps + self._fitfun = self._nlogps else: - self._fitfun = dist.nnlf + self._fitfun = self._nnlf self.data = ravel(data) if self.copydata: @@ -714,6 +622,7 @@ class FitDistribution(rv_frozen): self.data.sort() par, fixedn = self._fit(*args, **kwds) + # super(FitDistribution, self).__init__(dist, *par) self.par = arr(par) somefixed = len(fixedn) > 0 if somefixed: @@ -729,13 +638,13 @@ class FitDistribution(rv_frozen): self._compute_cov() # Set confidence interval for parameters - pvar = numpy.diag(self.par_cov) + pvar = np.diag(self.par_cov) zcrit = -norm_ppf(self.alpha / 2.0) self.par_lower = self.par - zcrit * sqrt(pvar) self.par_upper = self.par + zcrit * sqrt(pvar) - self.LLmax = -dist.nnlf(self.par, self.data) - self.LPSmax = -dist.nlogps(self.par, self.data) + self.LLmax = -self._nnlf(self.par, self.data) + self.LPSmax = -self._nlogps(self.par, self.data) self.pvalue = self._pvalue(self.par, self.data, unknown_numpar=numpar) def __repr__(self): @@ -747,17 +656,30 @@ class FitDistribution(rv_frozen): return ''.join(t) def _reduce_func(self, args, kwds): + # First of all, convert fshapes params to fnum: eg for stats.beta, + # shapes='a, b'. To fix `a`, can specify either `f1` or `fa`. + # Convert the latter into the former. + if self.shapes: + shapes = self.shapes.replace(',', ' ').split() + for j, s in enumerate(shapes): + val = kwds.pop('f' + s, None) or kwds.pop('fix_' + s, None) + if val is not None: + key = 'f%d' % j + if key in kwds: + raise ValueError("Duplicate entry for %s." 
% key) + else: + kwds[key] = val args = list(args) Nargs = len(args) fixedn = [] - index = range(Nargs) names = ['f%d' % n for n in range(Nargs - 2)] + ['floc', 'fscale'] - x0 = args[:] - for n, key in zip(index[::-1], names[::-1]): + x0 = [] + for n, key in enumerate(names): if key in kwds: fixedn.append(n) - args[n] = kwds[key] - del x0[n] + args[n] = kwds.pop(key) + else: + x0.append(args[n]) fitfun = self._fitfun @@ -765,7 +687,7 @@ class FitDistribution(rv_frozen): func = fitfun restore = None else: - if len(fixedn) == len(index): + if len(fixedn) == Nargs: raise ValueError("All parameters fixed. " + "There is nothing to optimize.") @@ -786,6 +708,134 @@ class FitDistribution(rv_frozen): return x0, func, restore, args, fixedn + @staticmethod + def _hessian(nnlf, theta, data, eps=None): + ''' approximate hessian of nnlf where theta are the parameters + (including loc and scale) + ''' + if eps is None: + eps = (_EPS) ** 0.4 + num_par = len(theta) + # pab 07.01.2001: Always choose the stepsize h so that + # it is an exactly representable number. + # This is important when calculating numerical derivatives and is + # accomplished by the following. + delta = (eps + 2.0) - 2.0 + delta2 = delta ** 2.0 + # Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with + # 1/(d^2 L(theta|x)/dtheta^2) + # using central differences + + LL = nnlf(theta, data) + H = zeros((num_par, num_par)) # Hessian matrix + theta = tuple(theta) + for ix in xrange(num_par): + sparam = list(theta) + sparam[ix] = theta[ix] + delta + fp = nnlf(sparam, data) + + sparam[ix] = theta[ix] - delta + fm = nnlf(sparam, data) + + H[ix, ix] = (fp - 2 * LL + fm) / delta2 + for iy in range(ix + 1, num_par): + sparam[ix] = theta[ix] + delta + sparam[iy] = theta[iy] + delta + fpp = nnlf(sparam, data) + + sparam[iy] = theta[iy] - delta + fpm = nnlf(sparam, data) + + sparam[ix] = theta[ix] - delta + fmm = nnlf(sparam, data) + + sparam[iy] = theta[iy] + delta + fmp = nnlf(sparam, data) + + H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2) + H[iy, ix] = H[ix, iy] + sparam[iy] = theta[iy] + return -H + + def _nnlf(self, theta, x): + return self.dist._penalized_nnlf(theta, x) + + def _nlogps(self, theta, x): + """ Moran's negative log Product Spacings statistic + + where theta are the parameters (including loc and scale) + + Note the data in x must be sorted + + References + ----------- + + R. C. H. Cheng; N. A. K. Amin (1983) + "Estimating Parameters in Continuous Univariate Distributions with a + Shifted Origin.", + Journal of the Royal Statistical Society. Series B (Methodological), + Vol. 45, No. 3. (1983), pp. 394-403. + + R. C. H. Cheng; M. A. Stephens (1989) + "A Goodness-Of-Fit Test Using Moran's Statistic with Estimated + Parameters", Biometrika, 76, 2, pp 385-392 + + Wong, T.S.T. and Li, W.K. (2006) + "A note on the estimation of extreme value distributions using maximum + product of spacings.", + IMS Lecture Notes Monograph Series 2006, Vol. 52, pp. 
272-283 + """ + n = 2 if self._rv_continous else 1 + try: + loc = theta[-n] + scale = theta[-1] + args = tuple(theta[:-n]) + except IndexError: + raise ValueError("Not enough input arguments.") + if not self._rv_continous: + scale = 1 + if not self._argcheck(*args) or scale <= 0: + return np.inf + dist = self.dist + x = asarray((x - loc) / scale) + cond0 = (x <= dist.a) | (dist.b <= x) + Nbad = np.sum(cond0) + if Nbad > 0: + x = argsreduce(~cond0, x)[0] + + lowertail = True + if lowertail: + prb = np.hstack((0.0, dist.cdf(x, *args), 1.0)) + dprb = np.diff(prb) + else: + prb = np.hstack((1.0, dist.sf(x, *args), 0.0)) + dprb = -np.diff(prb) + + logD = log(dprb) + dx = np.diff(x, axis=0) + tie = (dx == 0) + if any(tie): + # TODO : implement this method for treating ties in data: + # Assume measuring error is delta. Then compute + # yL = F(xi-delta,theta) + # yU = F(xi+delta,theta) + # and replace + # logDj = log((yU-yL)/(r-1)) for j = i+1,i+2,...i+r-1 + + # The following is OK when only minimization of T is wanted + i_tie, = np.nonzero(tie) + tiedata = x[i_tie] + logD[i_tie + 1] = log(dist._pdf(tiedata, *args)) - log(scale) + + finiteD = np.isfinite(logD) + nonfiniteD = 1 - finiteD + Nbad += np.sum(nonfiniteD, axis=0) + if Nbad > 0: + T = -np.sum(logD[finiteD], axis=0) + 100.0 * log(_XMAX) * Nbad + else: + T = -np.sum(logD, axis=0) + return T + def _fit(self, *args, **kwds): dist = self.dist @@ -799,15 +849,14 @@ class FitDistribution(rv_frozen): # get distribution specific starting locations start = dist._fitstart(data) args += start[Narg:-2] - loc = kwds.get('loc', start[-2]) - scale = kwds.get('scale', start[-1]) + loc = kwds.pop('loc', start[-2]) + scale = kwds.pop('scale', start[-1]) args += (loc, scale) x0, func, restore, args, fixedn = self._reduce_func(args, kwds) if self.search: - optimizer = kwds.get('optimizer', optimize.fmin) + optimizer = kwds.pop('optimizer', optimize.fmin) # convert string to function in scipy.optimize - if (not callable(optimizer) and - isinstance(optimizer, (str, unicode))): + if not callable(optimizer) and isinstance(optimizer, string_types): if not optimizer.startswith('fmin_'): optimizer = "fmin_" + optimizer if optimizer == 'fmin_': @@ -816,7 +865,9 @@ class FitDistribution(rv_frozen): optimizer = getattr(optimize, optimizer) except AttributeError: raise ValueError("%s is not a valid optimizer" % optimizer) - + # by now kwds must be empty, since everybody took what they needed + if kwds: + raise TypeError("Unknown arguments: %s." 
% kwds) vals = optimizer(func, x0, args=(ravel(data),), disp=0) vals = tuple(vals) else: @@ -829,8 +880,7 @@ class FitDistribution(rv_frozen): '''Compute covariance ''' somefixed = (self.par_fix is not None) and any(isfinite(self.par_fix)) - # H1 = numpy.asmatrix(self.dist.hessian_nnlf(self.par, self.data)) - H = numpy.asmatrix(self.dist.hessian_nlogps(self.par, self.data)) + H = np.asmatrix(self._hessian(self._fitfun, self.par, self.data)) self.H = H try: if somefixed: @@ -1034,7 +1084,7 @@ class FitDistribution(rv_frozen): # yy[0,0] = 0.0 # pdf yy[:, 0] = 0.0 # histogram yy.shape = (-1,) - yy = numpy.hstack((yy, 0.0)) + yy = np.hstack((yy, 0.0)) return xx, yy def _get_empirical_pdf(self): @@ -1110,7 +1160,7 @@ class FitDistribution(rv_frozen): Note: the data in x must be sorted ''' - dx = numpy.diff(x, axis=0) + dx = np.diff(x, axis=0) tie = (dx == 0) if any(tie): warnings.warn( diff --git a/wafo/stats/kde.py b/wafo/stats/kde.py deleted file mode 100644 index 85ddbf1..0000000 --- a/wafo/stats/kde.py +++ /dev/null @@ -1,541 +0,0 @@ -#------------------------------------------------------------------------------- -# -# Define classes for (uni/multi)-variate kernel density estimation. -# -# Currently, only Gaussian kernels are implemented. -# -# Written by: Robert Kern -# -# Date: 2004-08-09 -# -# Modified: 2005-02-10 by Robert Kern. -# Contributed to Scipy -# 2005-10-07 by Robert Kern. -# Some fixes to match the new scipy_core -# -# Copyright 2004-2005 by Enthought, Inc. -# -#------------------------------------------------------------------------------- - -from __future__ import division, print_function, absolute_import - -# Standard library imports. -import warnings - -# Scipy imports. -from scipy._lib.six import callable, string_types -from scipy import linalg, special - -from numpy import atleast_2d, reshape, zeros, newaxis, dot, exp, pi, sqrt, \ - ravel, power, atleast_1d, squeeze, sum, transpose -import numpy as np -from numpy.random import randint, multivariate_normal - -# Local imports. -from . import mvn - - -__all__ = ['gaussian_kde'] - - -class gaussian_kde(object): - """Representation of a kernel-density estimate using Gaussian kernels. - - Kernel density estimation is a way to estimate the probability density - function (PDF) of a random variable in a non-parametric way. - `gaussian_kde` works for both uni-variate and multi-variate data. It - includes automatic bandwidth determination. The estimation works best for - a unimodal distribution; bimodal or multi-modal distributions tend to be - oversmoothed. - - Parameters - ---------- - dataset : array_like - Datapoints to estimate from. In case of univariate data this is a 1-D - array, otherwise a 2-D array with shape (# of dims, # of data). - bw_method : str, scalar or callable, optional - The method used to calculate the estimator bandwidth. This can be - 'scott', 'silverman', a scalar constant or a callable. If a scalar, - this will be used directly as `kde.factor`. If a callable, it should - take a `gaussian_kde` instance as only parameter and return a scalar. - If None (default), 'scott' is used. See Notes for more details. - - Attributes - ---------- - dataset : ndarray - The dataset with which `gaussian_kde` was initialized. - d : int - Number of dimensions. - n : int - Number of datapoints. - factor : float - The bandwidth factor, obtained from `kde.covariance_factor`, with which - the covariance matrix is multiplied. 
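`_compute_cov` above inverts a numerically approximated Hessian of the fit criterion. A self-contained sketch of the symmetric central-difference scheme that `_hessian` implements, checked on a quadratic form whose Hessian is known (`hessian_cd` is an illustrative name; numpy only)::

    import numpy as np

    def hessian_cd(f, theta, delta=1e-5):
        # symmetric second differences in every pair of directions
        theta = np.asarray(theta, dtype=float)
        n = theta.size
        H = np.empty((n, n))
        f0 = f(theta)
        for i in range(n):
            ei = np.zeros(n)
            ei[i] = delta
            H[i, i] = (f(theta + ei) - 2.0 * f0 + f(theta - ei)) / delta**2
            for j in range(i + 1, n):
                ej = np.zeros(n)
                ej[j] = delta
                H[i, j] = H[j, i] = (f(theta + ei + ej) + f(theta - ei - ej) -
                                     f(theta + ei - ej) - f(theta - ei + ej)
                                     ) / (4.0 * delta**2)
        return H

    A = np.array([[2.0, 1.0], [1.0, 4.0]])
    f = lambda th: 0.5 * th.dot(A).dot(th)   # Hessian of f is exactly A
    assert np.allclose(hessian_cd(f, [0.0, 0.0]), A, atol=1e-4)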
- covariance : ndarray - The covariance matrix of `dataset`, scaled by the calculated bandwidth - (`kde.factor`). - inv_cov : ndarray - The inverse of `covariance`. - - Methods - ------- - kde.evaluate(points) : ndarray - Evaluate the estimated pdf on a provided set of points. - kde(points) : ndarray - Same as kde.evaluate(points) - kde.integrate_gaussian(mean, cov) : float - Multiply pdf with a specified Gaussian and integrate over the whole - domain. - kde.integrate_box_1d(low, high) : float - Integrate pdf (1D only) between two bounds. - kde.integrate_box(low_bounds, high_bounds) : float - Integrate pdf over a rectangular space between low_bounds and - high_bounds. - kde.integrate_kde(other_kde) : float - Integrate two kernel density estimates multiplied together. - kde.pdf(points) : ndarray - Alias for ``kde.evaluate(points)``. - kde.logpdf(points) : ndarray - Equivalent to ``np.log(kde.evaluate(points))``. - kde.resample(size=None) : ndarray - Randomly sample a dataset from the estimated pdf. - kde.set_bandwidth(bw_method='scott') : None - Computes the bandwidth, i.e. the coefficient that multiplies the data - covariance matrix to obtain the kernel covariance matrix. - .. versionadded:: 0.11.0 - kde.covariance_factor : float - Computes the coefficient (`kde.factor`) that multiplies the data - covariance matrix to obtain the kernel covariance matrix. - The default is `scotts_factor`. A subclass can overwrite this method - to provide a different method, or set it through a call to - `kde.set_bandwidth`. - - Notes - ----- - Bandwidth selection strongly influences the estimate obtained from the KDE - (much more so than the actual shape of the kernel). Bandwidth selection - can be done by a "rule of thumb", by cross-validation, by "plug-in - methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde` - uses a rule of thumb, the default is Scott's Rule. - - Scott's Rule [1]_, implemented as `scotts_factor`, is:: - - n**(-1./(d+4)), - - with ``n`` the number of data points and ``d`` the number of dimensions. - Silverman's Rule [2]_, implemented as `silverman_factor`, is:: - - (n * (d + 2) / 4.)**(-1. / (d + 4)). - - Good general descriptions of kernel density estimation can be found in [1]_ - and [2]_, the mathematics for this multi-dimensional implementation can be - found in [1]_. - - References - ---------- - .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and - Visualization", John Wiley & Sons, New York, Chicester, 1992. - .. [2] B.W. Silverman, "Density Estimation for Statistics and Data - Analysis", Vol. 26, Monographs on Statistics and Applied Probability, - Chapman and Hall, London, 1986. - .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A - Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993. - .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel - conditional density estimation", Computational Statistics & Data - Analysis, Vol. 36, pp. 279-298, 2001. - - Examples - -------- - Generate some random two-dimensional data: - - >>> from scipy import stats - >>> def measure(n): - >>> "Measurement model, return two coupled measurements." 
- >>> m1 = np.random.normal(size=n) - >>> m2 = np.random.normal(scale=0.5, size=n) - >>> return m1+m2, m1-m2 - - >>> m1, m2 = measure(2000) - >>> xmin = m1.min() - >>> xmax = m1.max() - >>> ymin = m2.min() - >>> ymax = m2.max() - - Perform a kernel density estimate on the data: - - >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] - >>> positions = np.vstack([X.ravel(), Y.ravel()]) - >>> values = np.vstack([m1, m2]) - >>> kernel = stats.gaussian_kde(values) - >>> Z = np.reshape(kernel(positions).T, X.shape) - - Plot the results: - - >>> import matplotlib.pyplot as plt - >>> fig = plt.figure() - >>> ax = fig.add_subplot(111) - >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r, - ... extent=[xmin, xmax, ymin, ymax]) - >>> ax.plot(m1, m2, 'k.', markersize=2) - >>> ax.set_xlim([xmin, xmax]) - >>> ax.set_ylim([ymin, ymax]) - >>> plt.show() - - """ - def __init__(self, dataset, bw_method=None): - self.dataset = atleast_2d(dataset) - if not self.dataset.size > 1: - raise ValueError("`dataset` input should have multiple elements.") - - self.d, self.n = self.dataset.shape - self.set_bandwidth(bw_method=bw_method) - - def evaluate(self, points): - """Evaluate the estimated pdf on a set of points. - - Parameters - ---------- - points : (# of dimensions, # of points)-array - Alternatively, a (# of dimensions,) vector can be passed in and - treated as a single point. - - Returns - ------- - values : (# of points,)-array - The values at each point. - - Raises - ------ - ValueError : if the dimensionality of the input points is different than - the dimensionality of the KDE. - - """ - points = atleast_2d(points) - - d, m = points.shape - if d != self.d: - if d == 1 and m == self.d: - # points was passed in as a row vector - points = reshape(points, (self.d, 1)) - m = 1 - else: - msg = "points have dimension %s, dataset has dimension %s" % (d, - self.d) - raise ValueError(msg) - - result = zeros((m,), dtype=np.float) - - if m >= self.n: - # there are more points than data, so loop over data - for i in range(self.n): - diff = self.dataset[:, i, newaxis] - points - tdiff = dot(self.inv_cov, diff) - energy = sum(diff*tdiff,axis=0) / 2.0 - result = result + exp(-energy) - else: - # loop over points - for i in range(m): - diff = self.dataset - points[:, i, newaxis] - tdiff = dot(self.inv_cov, diff) - energy = sum(diff * tdiff, axis=0) / 2.0 - result[i] = sum(exp(-energy), axis=0) - - result = result / self._norm_factor - - return result - - __call__ = evaluate - - def integrate_gaussian(self, mean, cov): - """ - Multiply estimated density by a multivariate Gaussian and integrate - over the whole space. - - Parameters - ---------- - mean : aray_like - A 1-D array, specifying the mean of the Gaussian. - cov : array_like - A 2-D array, specifying the covariance matrix of the Gaussian. - - Returns - ------- - result : scalar - The value of the integral. - - Raises - ------ - ValueError : - If the mean or covariance of the input Gaussian differs from - the KDE's dimensionality. 
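The kernel loops in `evaluate` above compute a plain sum of Gaussians divided by `_norm_factor`; for a 1-D dataset the same number is easy to reproduce directly, which makes a handy cross-check (sketch)::

    import numpy as np
    from scipy.stats import gaussian_kde

    data = np.array([-1.0, 0.0, 2.0])
    kde = gaussian_kde(data)
    x = np.array([0.5, 1.5])

    h2 = kde.covariance[0, 0]   # data variance scaled by kde.factor**2
    dens = np.exp(-(x[:, None] - data) ** 2 / (2.0 * h2)).sum(axis=1)
    dens /= np.sqrt(2.0 * np.pi * h2) * data.size
    assert np.allclose(dens, kde(x))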
- - """ - mean = atleast_1d(squeeze(mean)) - cov = atleast_2d(cov) - - if mean.shape != (self.d,): - raise ValueError("mean does not have dimension %s" % self.d) - if cov.shape != (self.d, self.d): - raise ValueError("covariance does not have dimension %s" % self.d) - - # make mean a column vector - mean = mean[:, newaxis] - - sum_cov = self.covariance + cov - - diff = self.dataset - mean - tdiff = dot(linalg.inv(sum_cov), diff) - - energies = sum(diff * tdiff, axis=0) / 2.0 - result = sum(exp(-energies), axis=0) / sqrt(linalg.det(2 * pi * - sum_cov)) / self.n - - return result - - def integrate_box_1d(self, low, high): - """ - Computes the integral of a 1D pdf between two bounds. - - Parameters - ---------- - low : scalar - Lower bound of integration. - high : scalar - Upper bound of integration. - - Returns - ------- - value : scalar - The result of the integral. - - Raises - ------ - ValueError - If the KDE is over more than one dimension. - - """ - if self.d != 1: - raise ValueError("integrate_box_1d() only handles 1D pdfs") - - stdev = ravel(sqrt(self.covariance))[0] - - normalized_low = ravel((low - self.dataset) / stdev) - normalized_high = ravel((high - self.dataset) / stdev) - - value = np.mean(special.ndtr(normalized_high) - - special.ndtr(normalized_low)) - return value - - def integrate_box(self, low_bounds, high_bounds, maxpts=None): - """Computes the integral of a pdf over a rectangular interval. - - Parameters - ---------- - low_bounds : array_like - A 1-D array containing the lower bounds of integration. - high_bounds : array_like - A 1-D array containing the upper bounds of integration. - maxpts : int, optional - The maximum number of points to use for integration. - - Returns - ------- - value : scalar - The result of the integral. - - """ - if maxpts is not None: - extra_kwds = {'maxpts': maxpts} - else: - extra_kwds = {} - - value, inform = mvn.mvnun(low_bounds, high_bounds, self.dataset, - self.covariance, **extra_kwds) - if inform: - msg = ('An integral in mvn.mvnun requires more points than %s' % - (self.d * 1000)) - warnings.warn(msg) - - return value - - def integrate_kde(self, other): - """ - Computes the integral of the product of this kernel density estimate - with another. - - Parameters - ---------- - other : gaussian_kde instance - The other kde. - - Returns - ------- - value : scalar - The result of the integral. - - Raises - ------ - ValueError - If the KDEs have different dimensionality. - - """ - if other.d != self.d: - raise ValueError("KDEs are not the same dimensionality") - - # we want to iterate over the smallest number of points - if other.n < self.n: - small = other - large = self - else: - small = self - large = other - - sum_cov = small.covariance + large.covariance - sum_cov_chol = linalg.cho_factor(sum_cov) - result = 0.0 - for i in range(small.n): - mean = small.dataset[:, i, newaxis] - diff = large.dataset - mean - tdiff = linalg.cho_solve(sum_cov_chol, diff) - - energies = sum(diff * tdiff, axis=0) / 2.0 - result += sum(exp(-energies), axis=0) - - result /= sqrt(linalg.det(2 * pi * sum_cov)) * large.n * small.n - - return result - - def resample(self, size=None): - """ - Randomly sample a dataset from the estimated pdf. - - Parameters - ---------- - size : int, optional - The number of samples to draw. If not provided, then the size is - the same as the underlying dataset. - - Returns - ------- - resample : (self.d, `size`) ndarray - The sampled dataset. 
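`integrate_box_1d` reduces the exact integral of the kernel mixture to a mean of normal CDF differences via `special.ndtr`; direct quadrature gives the same value (sketch)::

    import numpy as np
    from scipy.integrate import quad
    from scipy.stats import gaussian_kde

    kde = gaussian_kde([0.0, 1.0, 3.0])
    exact = kde.integrate_box_1d(-1.0, 2.0)
    numeric, _ = quad(lambda t: kde(t)[0], -1.0, 2.0)
    assert np.isclose(exact, numeric)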
- - """ - if size is None: - size = self.n - - norm = transpose(multivariate_normal(zeros((self.d,), float), - self.covariance, size=size)) - indices = randint(0, self.n, size=size) - means = self.dataset[:, indices] - - return means + norm - - def scotts_factor(self): - return power(self.n, -1./(self.d+4)) - - def silverman_factor(self): - return power(self.n*(self.d+2.0)/4.0, -1./(self.d+4)) - - # Default method to calculate bandwidth, can be overwritten by subclass - covariance_factor = scotts_factor - - def set_bandwidth(self, bw_method=None): - """Compute the estimator bandwidth with given method. - - The new bandwidth calculated after a call to `set_bandwidth` is used - for subsequent evaluations of the estimated density. - - Parameters - ---------- - bw_method : str, scalar or callable, optional - The method used to calculate the estimator bandwidth. This can be - 'scott', 'silverman', a scalar constant or a callable. If a - scalar, this will be used directly as `kde.factor`. If a callable, - it should take a `gaussian_kde` instance as only parameter and - return a scalar. If None (default), nothing happens; the current - `kde.covariance_factor` method is kept. - - Notes - ----- - .. versionadded:: 0.11 - - Examples - -------- - >>> x1 = np.array([-7, -5, 1, 4, 5.]) - >>> kde = stats.gaussian_kde(x1) - >>> xs = np.linspace(-10, 10, num=50) - >>> y1 = kde(xs) - >>> kde.set_bandwidth(bw_method='silverman') - >>> y2 = kde(xs) - >>> kde.set_bandwidth(bw_method=kde.factor / 3.) - >>> y3 = kde(xs) - - >>> fig = plt.figure() - >>> ax = fig.add_subplot(111) - >>> ax.plot(x1, np.ones(x1.shape) / (4. * x1.size), 'bo', - ... label='Data points (rescaled)') - >>> ax.plot(xs, y1, label='Scott (default)') - >>> ax.plot(xs, y2, label='Silverman') - >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)') - >>> ax.legend() - >>> plt.show() - - """ - if bw_method is None: - pass - elif bw_method == 'scott': - self.covariance_factor = self.scotts_factor - elif bw_method == 'silverman': - self.covariance_factor = self.silverman_factor - elif np.isscalar(bw_method) and not isinstance(bw_method, string_types): - self._bw_method = 'use constant' - self.covariance_factor = lambda: bw_method - elif callable(bw_method): - self._bw_method = bw_method - self.covariance_factor = lambda: self._bw_method(self) - else: - msg = "`bw_method` should be 'scott', 'silverman', a scalar " \ - "or a callable." - raise ValueError(msg) - - self._compute_covariance() - - def _compute_covariance(self): - """Computes the covariance matrix for each Gaussian kernel using - covariance_factor(). - """ - self.factor = self.covariance_factor() - # Cache covariance and inverse covariance of the data - if not hasattr(self, '_data_inv_cov'): - self._data_covariance = atleast_2d(np.cov(self.dataset, rowvar=1, - bias=False)) - self._data_inv_cov = linalg.inv(self._data_covariance) - - self.covariance = self._data_covariance * self.factor**2 - self.inv_cov = self._data_inv_cov / self.factor**2 - self._norm_factor = sqrt(linalg.det(2*pi*self.covariance)) * self.n - - def pdf(self, x): - """ - Evaluate the estimated pdf on a provided set of points. - - Notes - ----- - This is an alias for `gaussian_kde.evaluate`. See the ``evaluate`` - docstring for more details. - - """ - return self.evaluate(x) - - def logpdf(self, x): - """ - Evaluate the log of the estimated pdf on a provided set of points. - - Notes - ----- - See `gaussian_kde.evaluate` for more details; this method simply - returns ``np.log(gaussian_kde.evaluate(x))``. 
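Both rules of thumb above are simple powers of the sample size; for 1-D data (``d = 1``) they are easy to verify (sketch)::

    import numpy as np
    from scipy.stats import gaussian_kde

    x = np.random.RandomState(0).randn(400)
    kde = gaussian_kde(x)                   # Scott's rule is the default
    assert np.isclose(kde.factor, 400.0 ** (-1.0 / 5))   # n**(-1/(d+4))

    kde.set_bandwidth(bw_method='silverman')
    assert np.isclose(kde.factor, (400.0 * 3 / 4) ** (-1.0 / 5))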
- - """ - return np.log(self.evaluate(x)) diff --git a/wafo/stats/kde_example.py b/wafo/stats/kde_example.py deleted file mode 100644 index 0e9139c..0000000 --- a/wafo/stats/kde_example.py +++ /dev/null @@ -1,15 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Tue Dec 06 16:02:47 2011 - -@author: pab -""" -import numpy as np -import wafo.kdetools as wk -n = 100 -x = np.sort(5*np.random.rand(1,n)-2.5, axis=-1).ravel() -y = (np.cos(x)>2*np.random.rand(n, 1)-1).ravel() - -kreg = wk.KRegression(x,y) -f = kreg(output='plotobj', title='Kernel regression', plotflag=1) -f.plot() \ No newline at end of file diff --git a/wafo/stats/misc.py b/wafo/stats/misc.py deleted file mode 100644 index 0cb9108..0000000 --- a/wafo/stats/misc.py +++ /dev/null @@ -1,13 +0,0 @@ -from numpy import asarray, ndarray, ones, nan #, reshape, repeat, product - -def valarray(shape, value=nan, typecode=None): - """Return an array of all value. - """ - #out = reshape(repeat([value],product(shape,axis=0),axis=0),shape) - out = ones(shape, dtype=bool) * value - if typecode is not None: - out = out.astype(typecode) - if not isinstance(out, ndarray): - out = asarray(out) - return out - diff --git a/wafo/stats/morestats.py b/wafo/stats/morestats.py deleted file mode 100644 index f5521a5..0000000 --- a/wafo/stats/morestats.py +++ /dev/null @@ -1,2377 +0,0 @@ -# Author: Travis Oliphant, 2002 -# -# Further updates and enhancements by many SciPy developers. -# -from __future__ import division, print_function, absolute_import - -import math -import warnings - -import numpy as np -from numpy import (isscalar, r_, log, sum, around, unique, asarray, - zeros, arange, sort, amin, amax, any, atleast_1d, sqrt, ceil, - floor, array, poly1d, compress, not_equal, pi, exp, ravel, angle) -from numpy.testing.decorators import setastest - -from scipy._lib.six import string_types -from ._numpy_compat import count_nonzero -from scipy import optimize -from scipy import special -from . import statlib -from . import stats -from .stats import find_repeats -from .contingency import chi2_contingency -from . import distributions -from ._distn_infrastructure import rv_generic - - -__all__ = ['mvsdist', - 'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot', - 'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot', - 'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test', - 'fligner', 'mood', 'wilcoxon', 'median_test', - 'pdf_fromgamma', 'circmean', 'circvar', 'circstd', 'anderson_ksamp' - ] - - -def bayes_mvs(data, alpha=0.90): - """ - Bayesian confidence intervals for the mean, var, and std. - - Parameters - ---------- - data : array_like - Input data, if multi-dimensional it is flattened to 1-D by `bayes_mvs`. - Requires 2 or more data points. - alpha : float, optional - Probability that the returned confidence interval contains - the true parameter. - - Returns - ------- - mean_cntr, var_cntr, std_cntr : tuple - The three results are for the mean, variance and standard deviation, - respectively. Each result is a tuple of the form:: - - (center, (lower, upper)) - - with `center` the mean of the conditional pdf of the value given the - data, and `(lower, upper)` a confidence interval, centered on the - median, containing the estimate to a probability `alpha`. 
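The ``(center, (lower, upper))`` structure just described is exactly what querying the frozen distributions from `mvsdist` (defined below) produces, since `bayes_mvs` is a thin wrapper around it; a quick sketch::

    import numpy as np
    from scipy.stats import bayes_mvs, mvsdist

    data = [6, 9, 12, 7, 8, 8, 13]
    results = bayes_mvs(data, alpha=0.9)
    dists = mvsdist(data)
    for (center, (low, high)), d in zip(results, dists):
        assert np.isclose(center, d.mean())
        assert np.allclose((low, high), d.interval(0.9))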
- - Notes - ----- - Each tuple of mean, variance, and standard deviation estimates represent - the (center, (lower, upper)) with center the mean of the conditional pdf - of the value given the data and (lower, upper) is a confidence interval - centered on the median, containing the estimate to a probability - `alpha`. - - Converts data to 1-D and assumes all data has the same mean and variance. - Uses Jeffrey's prior for variance and std. - - Equivalent to tuple((x.mean(), x.interval(alpha)) for x in mvsdist(dat)) - - References - ---------- - T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and - standard-deviation from data", http://hdl.handle.net/1877/438, 2006. - - """ - res = mvsdist(data) - if alpha >= 1 or alpha <= 0: - raise ValueError("0 < alpha < 1 is required, but alpha=%s was given." % alpha) - return tuple((x.mean(), x.interval(alpha)) for x in res) - - -def mvsdist(data): - """ - 'Frozen' distributions for mean, variance, and standard deviation of data. - - Parameters - ---------- - data : array_like - Input array. Converted to 1-D using ravel. - Requires 2 or more data-points. - - Returns - ------- - mdist : "frozen" distribution object - Distribution object representing the mean of the data - vdist : "frozen" distribution object - Distribution object representing the variance of the data - sdist : "frozen" distribution object - Distribution object representing the standard deviation of the data - - Notes - ----- - The return values from bayes_mvs(data) is equivalent to - ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``. - - In other words, calling ``.mean()`` and ``.interval(0.90)`` - on the three distribution objects returned from this function will give - the same results that are returned from `bayes_mvs`. - - Examples - -------- - >>> from scipy.stats import mvsdist - >>> data = [6, 9, 12, 7, 8, 8, 13] - >>> mean, var, std = mvsdist(data) - - We now have frozen distribution objects "mean", "var" and "std" that we can - examine: - - >>> mean.mean() - 9.0 - >>> mean.interval(0.95) - (6.6120585482655692, 11.387941451734431) - >>> mean.std() - 1.1952286093343936 - - """ - x = ravel(data) - n = len(x) - if (n < 2): - raise ValueError("Need at least 2 data-points.") - xbar = x.mean() - C = x.var() - if (n > 1000): # gaussian approximations for large n - mdist = distributions.norm(loc=xbar, scale=math.sqrt(C/n)) - sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C/(2.*n))) - vdist = distributions.norm(loc=C, scale=math.sqrt(2.0/n)*C) - else: - nm1 = n-1 - fac = n*C/2. - val = nm1/2. - mdist = distributions.t(nm1,loc=xbar,scale=math.sqrt(C/nm1)) - sdist = distributions.gengamma(val,-2,scale=math.sqrt(fac)) - vdist = distributions.invgamma(val,scale=fac) - return mdist, vdist, sdist - - -def kstat(data,n=2): - """ - Return the nth k-statistic (1<=n<=4 so far). - - The nth k-statistic is the unique symmetric unbiased estimator of the nth - cumulant kappa_n. - - Parameters - ---------- - data : array_like - Input array. - n : int, {1, 2, 3, 4}, optional - Default is equal to 2. - - Returns - ------- - kstat : float - The nth k-statistic. - - See Also - -------- - kstatvar: Returns an unbiased estimator of the variance of the k-statistic. - - Notes - ----- - The cumulants are related to central moments but are specifically defined - using a power series expansion of the logarithm of the characteristic - function (which is the Fourier transform of the PDF). 
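Before the expansion spelled out below, a concrete check: for ``n = 1`` and ``n = 2`` the k-statistic formulas reduce to the sample mean and the unbiased sample variance (sketch)::

    import numpy as np
    from scipy.stats import kstat

    x = np.random.RandomState(3).randn(50)
    assert np.isclose(kstat(x, n=1), x.mean())        # k_1: sample mean
    assert np.isclose(kstat(x, n=2), x.var(ddof=1))   # k_2: unbiased variance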
- In particular let phi(t) be the characteristic function, then:: - - ln phi(t) = > kappa_n (it)^n / n! (sum from n=0 to inf) - - The first few cumulants (kappa_n) in terms of central moments (mu_n) are:: - - kappa_1 = mu_1 - kappa_2 = mu_2 - kappa_3 = mu_3 - kappa_4 = mu_4 - 3*mu_2**2 - kappa_5 = mu_5 - 10*mu_2 * mu_3 - - References - ---------- - http://mathworld.wolfram.com/k-Statistic.html - - http://mathworld.wolfram.com/Cumulant.html - - """ - if n > 4 or n < 1: - raise ValueError("k-statistics only supported for 1<=n<=4") - n = int(n) - S = zeros(n+1,'d') - data = ravel(data) - N = len(data) - for k in range(1,n+1): - S[k] = sum(data**k,axis=0) - if n == 1: - return S[1]*1.0/N - elif n == 2: - return (N*S[2]-S[1]**2.0)/(N*(N-1.0)) - elif n == 3: - return (2*S[1]**3 - 3*N*S[1]*S[2]+N*N*S[3]) / (N*(N-1.0)*(N-2.0)) - elif n == 4: - return (-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 - - 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) / \ - (N*(N-1.0)*(N-2.0)*(N-3.0)) - else: - raise ValueError("Should not be here.") - - -def kstatvar(data,n=2): - """ - Returns an unbiased estimator of the variance of the k-statistic. - - See `kstat` for more details of the k-statistic. - - Parameters - ---------- - data : array_like - Input array. - n : int, {1, 2}, optional - Default is equal to 2. - - Returns - ------- - kstatvar : float - The nth k-statistic variance. - - See Also - -------- - kstat - - """ - data = ravel(data) - N = len(data) - if n == 1: - return kstat(data,n=2)*1.0/N - elif n == 2: - k2 = kstat(data,n=2) - k4 = kstat(data,n=4) - return (2*k2*k2*N + (N-1)*k4)/(N*(N+1)) - else: - raise ValueError("Only n=1 or n=2 supported.") - - -def _calc_uniform_order_statistic_medians(x): - """See Notes section of `probplot` for details.""" - N = len(x) - osm_uniform = np.zeros(N, dtype=np.float64) - osm_uniform[-1] = 0.5**(1.0 / N) - osm_uniform[0] = 1 - osm_uniform[-1] - i = np.arange(2, N) - osm_uniform[1:-1] = (i - 0.3175) / (N + 0.365) - return osm_uniform - - -def _parse_dist_kw(dist, enforce_subclass=True): - """Parse `dist` keyword. - - Parameters - ---------- - dist : str or stats.distributions instance. - Several functions take `dist` as a keyword, hence this utility - function. - enforce_subclass : bool, optional - If True (default), `dist` needs to be a - `_distn_infrastructure.rv_generic` instance. - It can sometimes be useful to set this keyword to False, if a function - wants to accept objects that just look somewhat like such an instance - (for example, they have a ``ppf`` method). - - """ - if isinstance(dist, rv_generic): - pass - elif isinstance(dist, string_types): - try: - dist = getattr(distributions, dist) - except AttributeError: - raise ValueError("%s is not a valid distribution name" % dist) - elif enforce_subclass: - msg = ("`dist` should be a stats.distributions instance or a string " - "with the name of such a distribution.") - raise ValueError(msg) - - return dist - - -def probplot(x, sparams=(), dist='norm', fit=True, plot=None): - """ - Calculate quantiles for a probability plot, and optionally show the plot. - - Generates a probability plot of sample data against the quantiles of a - specified theoretical distribution (the normal distribution by default). - `probplot` optionally calculates a best-fit line for the data and plots the - results using Matplotlib or a given plot function. - - Parameters - ---------- - x : array_like - Sample/response data from which `probplot` creates the plot. 
- sparams : tuple, optional - Distribution-specific shape parameters (shape parameters plus location - and scale). - dist : str or stats.distributions instance, optional - Distribution or distribution function name. The default is 'norm' for a - normal probability plot. Objects that look enough like a - stats.distributions instance (i.e. they have a ``ppf`` method) are also - accepted. - fit : bool, optional - Fit a least-squares regression (best-fit) line to the sample data if - True (default). - plot : object, optional - If given, plots the quantiles and least squares fit. - `plot` is an object that has to have methods "plot" and "text". - The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, - or a custom object with the same methods. - Default is None, which means that no plot is created. - - Returns - ------- - (osm, osr) : tuple of ndarrays - Tuple of theoretical quantiles (osm, or order statistic medians) and - ordered responses (osr). `osr` is simply sorted input `x`. - For details on how `osm` is calculated see the Notes section. - (slope, intercept, r) : tuple of floats, optional - Tuple containing the result of the least-squares fit, if that is - performed by `probplot`. `r` is the square root of the coefficient of - determination. If ``fit=False`` and ``plot=None``, this tuple is not - returned. - - Notes - ----- - Even if `plot` is given, the figure is not shown or saved by `probplot`; - ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after - calling `probplot`. - - `probplot` generates a probability plot, which should not be confused with - a Q-Q or a P-P plot. Statsmodels has more extensive functionality of this - type, see ``statsmodels.api.ProbPlot``. - - The formula used for the theoretical quantiles (horizontal axis of the - probability plot) is Filliben's estimate:: - - quantiles = dist.ppf(val), for - - 0.5**(1/n), for i = n - val = (i - 0.3175) / (n + 0.365), for i = 2, ..., n-1 - 1 - 0.5**(1/n), for i = 1 - - where ``i`` indicates the i-th ordered value and ``n`` is the total number - of values. - - Examples - -------- - >>> from scipy import stats - >>> import matplotlib.pyplot as plt - >>> nsample = 100 - >>> np.random.seed(7654321) - - A t distribution with small degrees of freedom: - - >>> ax1 = plt.subplot(221) - >>> x = stats.t.rvs(3, size=nsample) - >>> res = stats.probplot(x, plot=plt) - - A t distribution with larger degrees of freedom: - - >>> ax2 = plt.subplot(222) - >>> x = stats.t.rvs(25, size=nsample) - >>> res = stats.probplot(x, plot=plt) - - A mixture of two normal distributions with broadcasting: - - >>> ax3 = plt.subplot(223) - >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5], - ... 
size=(nsample/2.,2)).ravel() - >>> res = stats.probplot(x, plot=plt) - - A standard normal distribution: - - >>> ax4 = plt.subplot(224) - >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample) - >>> res = stats.probplot(x, plot=plt) - - Produce a new figure with a loggamma distribution, using the ``dist`` and - ``sparams`` keywords: - - >>> fig = plt.figure() - >>> ax = fig.add_subplot(111) - >>> x = stats.loggamma.rvs(c=2.5, size=500) - >>> stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax) - >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5") - - Show the results with Matplotlib: - - >>> plt.show() - - """ - x = np.asarray(x) - osm_uniform = _calc_uniform_order_statistic_medians(x) - dist = _parse_dist_kw(dist, enforce_subclass=False) - if sparams is None: - sparams = () - if isscalar(sparams): - sparams = (sparams,) - if not isinstance(sparams, tuple): - sparams = tuple(sparams) - - osm = dist.ppf(osm_uniform, *sparams) - osr = sort(x) - if fit or (plot is not None): - # perform a linear fit. - slope, intercept, r, prob, sterrest = stats.linregress(osm, osr) - - if plot is not None: - plot.plot(osm, osr, 'bo', osm, slope*osm + intercept, 'r-') - try: - if hasattr(plot, 'set_title'): - # Matplotlib Axes instance or something that looks like it - plot.set_title('Probability Plot') - plot.set_xlabel('Quantiles') - plot.set_ylabel('Ordered Values') - else: - # matplotlib.pyplot module - plot.title('Probability Plot') - plot.xlabel('Quantiles') - plot.ylabel('Ordered Values') - except: - # Not an MPL object or something that looks (enough) like it. - # Don't crash on adding labels or title - pass - - # Add R^2 value to the plot as text - xmin = amin(osm) - xmax = amax(osm) - ymin = amin(x) - ymax = amax(x) - posx = xmin + 0.70 * (xmax - xmin) - posy = ymin + 0.01 * (ymax - ymin) - plot.text(posx, posy, "$R^2=%1.4f$" % r) - - if fit: - return (osm, osr), (slope, intercept, r) - else: - return osm, osr - - -def ppcc_max(x, brack=(0.0,1.0), dist='tukeylambda'): - """Returns the shape parameter that maximizes the probability plot - correlation coefficient for the given data to a one-parameter - family of distributions. - - See also ppcc_plot - """ - dist = _parse_dist_kw(dist) - osm_uniform = _calc_uniform_order_statistic_medians(x) - osr = sort(x) - - # this function computes the x-axis values of the probability plot - # and computes a linear regression (including the correlation) - # and returns 1-r so that a minimization function maximizes the - # correlation - def tempfunc(shape, mi, yvals, func): - xvals = func(mi, shape) - r, prob = stats.pearsonr(xvals, yvals) - return 1-r - - return optimize.brent(tempfunc, brack=brack, args=(osm_uniform, osr, dist.ppf)) - - -def ppcc_plot(x,a,b,dist='tukeylambda', plot=None, N=80): - """Returns (shape, ppcc), and optionally plots shape vs. ppcc - (probability plot correlation coefficient) as a function of shape - parameter for a one-parameter family of distributions from shape - value a to b. - - See also ppcc_max - """ - svals = r_[a:b:complex(N)] - ppcc = svals*0.0 - k = 0 - for sval in svals: - r1,r2 = probplot(x,sval,dist=dist,fit=1) - ppcc[k] = r2[-1] - k += 1 - if plot is not None: - plot.plot(svals, ppcc, 'x') - plot.title('(%s) PPCC Plot' % dist) - plot.xlabel('Prob Plot Corr. Coef.') - plot.ylabel('Shape Values') - return svals, ppcc - - -def boxcox_llf(lmb, data): - r"""The boxcox log-likelihood function. - - Parameters - ---------- - lmb : scalar - Parameter for Box-Cox transformation. See `boxcox` for details. 
- data : array_like - Data to calculate Box-Cox log-likelihood for. If `data` is - multi-dimensional, the log-likelihood is calculated along the first - axis. - - Returns - ------- - llf : float or ndarray - Box-Cox log-likelihood of `data` given `lmb`. A float for 1-D `data`, - an array otherwise. - - See Also - -------- - boxcox, probplot, boxcox_normplot, boxcox_normmax - - Notes - ----- - The Box-Cox log-likelihood function is defined here as - - .. math:: - - llf = (\lambda - 1) \sum_i(\log(x_i)) - - N/2 \log(\sum_i (y_i - \bar{y})^2 / N), - - where ``y`` is the Box-Cox transformed input data ``x``. - - Examples - -------- - >>> from scipy import stats - >>> import matplotlib.pyplot as plt - >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes - >>> np.random.seed(1245) - - Generate some random variates and calculate Box-Cox log-likelihood values - for them for a range of ``lmbda`` values: - - >>> x = stats.loggamma.rvs(5, loc=10, size=1000) - >>> lmbdas = np.linspace(-2, 10) - >>> llf = np.zeros(lmbdas.shape, dtype=np.float) - >>> for ii, lmbda in enumerate(lmbdas): - ... llf[ii] = stats.boxcox_llf(lmbda, x) - - Also find the optimal lmbda value with `boxcox`: - - >>> x_most_normal, lmbda_optimal = stats.boxcox(x) - - Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a - horizontal line to check that that's really the optimum: - - >>> fig = plt.figure() - >>> ax = fig.add_subplot(111) - >>> ax.plot(lmbdas, llf, 'b.-') - >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r') - >>> ax.set_xlabel('lmbda parameter') - >>> ax.set_ylabel('Box-Cox log-likelihood') - - Now add some probability plots to show that where the log-likelihood is - maximized the data transformed with `boxcox` looks closest to normal: - - >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right' - >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs): - ... xt = stats.boxcox(x, lmbda=lmbda) - ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt) - ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc) - ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-') - ... ax_inset.set_xticklabels([]) - ... ax_inset.set_yticklabels([]) - ... ax_inset.set_title('$\lambda=%1.2f$' % lmbda) - - >>> plt.show() - - """ - data = np.asarray(data) - N = data.shape[0] - if N == 0: - return np.nan - - y = boxcox(data, lmb) - y_mean = np.mean(y, axis=0) - llf = (lmb - 1) * np.sum(np.log(data), axis=0) - llf -= N / 2.0 * np.log(np.sum((y - y_mean)**2. 
/ N, axis=0)) - return llf - - -def _boxcox_conf_interval(x, lmax, alpha): - # Need to find the lambda for which - # f(x,lmbda) >= f(x,lmax) - 0.5*chi^2_alpha;1 - fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1) - target = boxcox_llf(lmax, x) - fac - - def rootfunc(lmbda, data, target): - return boxcox_llf(lmbda, data) - target - - # Find positive endpoint of interval in which answer is to be found - newlm = lmax + 0.5 - N = 0 - while (rootfunc(newlm, x, target) > 0.0) and (N < 500): - newlm += 0.1 - N += 1 - - if N == 500: - raise RuntimeError("Could not find endpoint.") - - lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target)) - - # Now find negative interval in the same way - newlm = lmax - 0.5 - N = 0 - while (rootfunc(newlm, x, target) > 0.0) and (N < 500): - newlm -= 0.1 - N += 1 - - if N == 500: - raise RuntimeError("Could not find endpoint.") - - lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target)) - return lmminus, lmplus - - -def boxcox(x, lmbda=None, alpha=None): - r""" - Return a positive dataset transformed by a Box-Cox power transformation. - - Parameters - ---------- - x : ndarray - Input array. Should be 1-dimensional. - lmbda : {None, scalar}, optional - If `lmbda` is not None, do the transformation for that value. - - If `lmbda` is None, find the lambda that maximizes the log-likelihood - function and return it as the second output argument. - alpha : {None, float}, optional - If `alpha` is not None, return the ``100 * (1-alpha)%`` confidence - interval for `lmbda` as the third output argument. - Must be between 0.0 and 1.0. - - Returns - ------- - boxcox : ndarray - Box-Cox power transformed array. - maxlog : float, optional - If the `lmbda` parameter is None, the second returned argument is - the lambda that maximizes the log-likelihood function. - (min_ci, max_ci) : tuple of float, optional - If `lmbda` parameter is None and `alpha` is not None, this returned - tuple of floats represents the minimum and maximum confidence limits - given `alpha`. - - See Also - -------- - probplot, boxcox_normplot, boxcox_normmax, boxcox_llf - - Notes - ----- - The Box-Cox transform is given by:: - - y = (x**lmbda - 1) / lmbda, for lmbda > 0 - log(x), for lmbda = 0 - - `boxcox` requires the input data to be positive. Sometimes a Box-Cox - transformation provides a shift parameter to achieve this; `boxcox` does - not. Such a shift parameter is equivalent to adding a positive constant to - `x` before calling `boxcox`. - - The confidence limits returned when `alpha` is provided give the interval - where: - - .. math:: - - llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1), - - with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared - function. - - References - ---------- - G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the - Royal Statistical Society B, 26, 211-252 (1964). 
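Editor's note: the transform quoted in the Notes above can be sanity-checked
with a direct NumPy transcription. This is a minimal sketch, not part of the
deleted module, and ``boxcox_naive`` is a hypothetical name; the deleted code
itself defers to the numerically robust ``scipy.special.boxcox``:

    import numpy as np

    def boxcox_naive(x, lmbda):
        # y = (x**lmbda - 1) / lmbda for lmbda != 0, log(x) for lmbda == 0
        x = np.asarray(x, dtype=float)
        if lmbda == 0:
            return np.log(x)
        return (x ** lmbda - 1.0) / lmbda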
-
-    Examples
-    --------
-    >>> from scipy import stats
-    >>> import matplotlib.pyplot as plt
-
-    We generate some random variates from a non-normal distribution and make a
-    probability plot for it, to show it is non-normal in the tails:
-
-    >>> fig = plt.figure()
-    >>> ax1 = fig.add_subplot(211)
-    >>> x = stats.loggamma.rvs(5, size=500) + 5
-    >>> stats.probplot(x, dist=stats.norm, plot=ax1)
-    >>> ax1.set_xlabel('')
-    >>> ax1.set_title('Probplot against normal distribution')
-
-    We now use `boxcox` to transform the data so it's closest to normal:
-
-    >>> ax2 = fig.add_subplot(212)
-    >>> xt, _ = stats.boxcox(x)
-    >>> stats.probplot(xt, dist=stats.norm, plot=ax2)
-    >>> ax2.set_title('Probplot after Box-Cox transformation')
-
-    >>> plt.show()
-
-    """
-    x = np.asarray(x)
-    if x.size == 0:
-        return x
-
-    if any(x <= 0):
-        raise ValueError("Data must be positive.")
-
-    if lmbda is not None:  # single transformation
-        return special.boxcox(x, lmbda)
-
-    # If lmbda=None, find the lmbda that maximizes the log-likelihood function.
-    lmax = boxcox_normmax(x, method='mle')
-    y = boxcox(x, lmax)
-
-    if alpha is None:
-        return y, lmax
-    else:
-        # Find confidence interval
-        interval = _boxcox_conf_interval(x, lmax, alpha)
-        return y, lmax, interval
-
-
-def boxcox_normmax(x, brack=(-2.0, 2.0), method='pearsonr'):
-    """Compute optimal Box-Cox transform parameter for input data.
-
-    Parameters
-    ----------
-    x : array_like
-        Input array.
-    brack : 2-tuple, optional
-        The starting interval for a downhill bracket search with
-        `optimize.brent`.  Note that this is in most cases not critical; the
-        final result is allowed to be outside this bracket.
-    method : str, optional
-        The method to determine the optimal transform parameter (`boxcox`
-        ``lmbda`` parameter).  Options are:
-
-        'pearsonr'  (default)
-            Maximizes the Pearson correlation coefficient between
-            ``y = boxcox(x)`` and the expected values for ``y`` if `x` were
-            normally distributed.
-
-        'mle'
-            Maximizes the log-likelihood function `boxcox_llf` (implemented
-            by minimizing its negative).  This is the method used in `boxcox`.
-
-        'all'
-            Use all optimization methods available, and return all results.
-            Useful to compare different methods.
-
-    Returns
-    -------
-    maxlog : float or ndarray
-        The optimal transform parameter found.  An array instead of a scalar
-        for ``method='all'``.
-
-    See Also
-    --------
-    boxcox, boxcox_llf, boxcox_normplot
-
-    Examples
-    --------
-    >>> from scipy import stats
-    >>> import matplotlib.pyplot as plt
-    >>> np.random.seed(1234)  # make this example reproducible
-
-    Generate some data and determine optimal ``lmbda`` in various ways:
-
-    >>> x = stats.loggamma.rvs(5, size=30) + 5
-    >>> y, lmax_mle = stats.boxcox(x)
-    >>> lmax_pearsonr = stats.boxcox_normmax(x)
-
-    >>> lmax_mle
-    7.177...
-    >>> lmax_pearsonr
-    7.916...
-    >>> stats.boxcox_normmax(x, method='all')
-    array([ 7.91667384,  7.17718692])
-
-    >>> fig = plt.figure()
-    >>> ax = fig.add_subplot(111)
-    >>> stats.boxcox_normplot(x, -10, 10, plot=ax)
-    >>> ax.axvline(lmax_mle, color='r')
-    >>> ax.axvline(lmax_pearsonr, color='g', ls='--')
-
-    >>> plt.show()
-
-    """
-    def _pearsonr(x, brack):
-        osm_uniform = _calc_uniform_order_statistic_medians(x)
-        xvals = distributions.norm.ppf(osm_uniform)
-
-        def _eval_pearsonr(lmbda, xvals, samps):
-            # This function computes the x-axis values of the probability plot
-            # and computes a linear regression (including the correlation) and
-            # returns ``1 - r`` so that a minimization function maximizes the
-            # correlation.
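-            # (Editor's note, added for clarity: this is the same 1 - r
-            # objective used by `ppcc_max` above, evaluated here on the
-            # sorted, Box-Cox-transformed sample against normal order
-            # statistic medians.)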
- y = boxcox(samps, lmbda) - yvals = np.sort(y) - r, prob = stats.pearsonr(xvals, yvals) - return 1 - r - - return optimize.brent(_eval_pearsonr, brack=brack, args=(xvals, x)) - - def _mle(x, brack): - def _eval_mle(lmb, data): - # function to minimize - return -boxcox_llf(lmb, data) - - return optimize.brent(_eval_mle, brack=brack, args=(x,)) - - def _all(x, brack): - maxlog = np.zeros(2, dtype=np.float) - maxlog[0] = _pearsonr(x, brack) - maxlog[1] = _mle(x, brack) - return maxlog - - methods = {'pearsonr': _pearsonr, - 'mle': _mle, - 'all': _all} - if method not in methods.keys(): - raise ValueError("Method %s not recognized." % method) - - optimfunc = methods[method] - return optimfunc(x, brack) - - -def boxcox_normplot(x, la, lb, plot=None, N=80): - """Compute parameters for a Box-Cox normality plot, optionally show it. - - A Box-Cox normality plot shows graphically what the best transformation - parameter is to use in `boxcox` to obtain a distribution that is close - to normal. - - Parameters - ---------- - x : array_like - Input array. - la, lb : scalar - The lower and upper bounds for the ``lmbda`` values to pass to `boxcox` - for Box-Cox transformations. These are also the limits of the - horizontal axis of the plot if that is generated. - plot : object, optional - If given, plots the quantiles and least squares fit. - `plot` is an object that has to have methods "plot" and "text". - The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, - or a custom object with the same methods. - Default is None, which means that no plot is created. - N : int, optional - Number of points on the horizontal axis (equally distributed from - `la` to `lb`). - - Returns - ------- - lmbdas : ndarray - The ``lmbda`` values for which a Box-Cox transform was done. - ppcc : ndarray - Probability Plot Correlelation Coefficient, as obtained from `probplot` - when fitting the Box-Cox transformed input `x` against a normal - distribution. - - See Also - -------- - probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max - - Notes - ----- - Even if `plot` is given, the figure is not shown or saved by - `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')`` - should be used after calling `probplot`. - - Examples - -------- - >>> from scipy import stats - >>> import matplotlib.pyplot as plt - - Generate some non-normally distributed data, and create a Box-Cox plot: - - >>> x = stats.loggamma.rvs(5, size=500) + 5 - >>> fig = plt.figure() - >>> ax = fig.add_subplot(111) - >>> stats.boxcox_normplot(x, -20, 20, plot=ax) - - Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in - the same plot: - - >>> _, maxlog = stats.boxcox(x) - >>> ax.axvline(maxlog, color='r') - - >>> plt.show() - - """ - x = np.asarray(x) - if x.size == 0: - return x - - if lb <= la: - raise ValueError("`lb` has to be larger than `la`.") - - lmbdas = np.linspace(la, lb, num=N) - ppcc = lmbdas * 0.0 - for i, val in enumerate(lmbdas): - # Determine for each lmbda the correlation coefficient of transformed x - z = boxcox(x, lmbda=val) - _, r2 = probplot(z, dist='norm', fit=True) - ppcc[i] = r2[-1] - - if plot is not None: - plot.plot(lmbdas, ppcc, 'x') - try: - if hasattr(plot, 'set_title'): - # Matplotlib Axes instance or something that looks like it - plot.set_title('Box-Cox Normality Plot') - plot.set_ylabel('Prob Plot Corr. Coef.') - plot.set_xlabel('$\lambda$') - else: - # matplotlib.pyplot module - plot.title('Box-Cox Normality Plot') - plot.ylabel('Prob Plot Corr. 
Coef.') - plot.xlabel('$\lambda$') - except Exception: - # Not an MPL object or something that looks (enough) like it. - # Don't crash on adding labels or title - pass - - return lmbdas, ppcc - - -def shapiro(x, a=None, reta=False): - """ - Perform the Shapiro-Wilk test for normality. - - The Shapiro-Wilk test tests the null hypothesis that the - data was drawn from a normal distribution. - - Parameters - ---------- - x : array_like - Array of sample data. - a : array_like, optional - Array of internal parameters used in the calculation. If these - are not given, they will be computed internally. If x has length - n, then a must have length n/2. - reta : bool, optional - Whether or not to return the internally computed a values. The - default is False. - - Returns - ------- - W : float - The test statistic. - p-value : float - The p-value for the hypothesis test. - a : array_like, optional - If `reta` is True, then these are the internally computed "a" - values that may be passed into this function on future calls. - - See Also - -------- - anderson : The Anderson-Darling test for normality - - References - ---------- - .. [1] http://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm - - """ - N = len(x) - if N < 3: - raise ValueError("Data must be at least length 3.") - if a is None: - a = zeros(N,'f') - init = 0 - else: - if len(a) != N//2: - raise ValueError("len(a) must equal len(x)/2") - init = 1 - y = sort(x) - a, w, pw, ifault = statlib.swilk(y, a[:N//2], init) - if ifault not in [0,2]: - warnings.warn(str(ifault)) - if N > 5000: - warnings.warn("p-value may not be accurate for N > 5000.") - if reta: - return w, pw, a - else: - return w, pw - -# Values from Stephens, M A, "EDF Statistics for Goodness of Fit and -# Some Comparisons", Journal of he American Statistical -# Association, Vol. 69, Issue 347, Sept. 1974, pp 730-737 -_Avals_norm = array([0.576, 0.656, 0.787, 0.918, 1.092]) -_Avals_expon = array([0.922, 1.078, 1.341, 1.606, 1.957]) -# From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution", -# Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588. -_Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038]) -# From Stephens, M A, "Tests of Fit for the Logistic Distribution Based -# on the Empirical Distribution Function.", Biometrika, -# Vol. 66, Issue 3, Dec. 1979, pp 591-595. -_Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010]) - - -def anderson(x,dist='norm'): - """ - Anderson-Darling test for data coming from a particular distribution - - The Anderson-Darling test is a modification of the Kolmogorov- - Smirnov test `kstest` for the null hypothesis that a sample is - drawn from a population that follows a particular distribution. - For the Anderson-Darling test, the critical values depend on - which distribution is being tested against. This function works - for normal, exponential, logistic, or Gumbel (Extreme Value - Type I) distributions. - - Parameters - ---------- - x : array_like - array of sample data - dist : {'norm','expon','logistic','gumbel','extreme1'}, optional - the type of distribution to test against. The default is 'norm' - and 'extreme1' is a synonym for 'gumbel' - - Returns - ------- - A2 : float - The Anderson-Darling test statistic - critical : list - The critical values for this distribution - sig : list - The significance levels for the corresponding critical values - in percents. 
The function returns critical values for a
-        differing set of significance levels depending on the
-        distribution that is being tested against.
-
-    Notes
-    -----
-    Critical values provided are for the following significance levels:
-
-    normal/exponential
-        15%, 10%, 5%, 2.5%, 1%
-    logistic
-        25%, 10%, 5%, 2.5%, 1%, 0.5%
-    Gumbel
-        25%, 10%, 5%, 2.5%, 1%
-
-    If A2 is larger than these critical values then for the corresponding
-    significance level, the null hypothesis that the data come from the
-    chosen distribution can be rejected.
-
-    References
-    ----------
-    .. [1] http://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
-    .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and
-           Some Comparisons, Journal of the American Statistical Association,
-           Vol. 69, pp. 730-737.
-    .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit
-           Statistics with Unknown Parameters, Annals of Statistics, Vol. 4,
-           pp. 357-369.
-    .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value
-           Distribution, Biometrika, Vol. 64, pp. 583-588.
-    .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference
-           to Tests for Exponentiality, Technical Report No. 262,
-           Department of Statistics, Stanford University, Stanford, CA.
-    .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution
-           Based on the Empirical Distribution Function, Biometrika, Vol. 66,
-           pp. 591-595.
-
-    """
-    if dist not in ['norm', 'expon', 'gumbel', 'extreme1', 'logistic']:
-        raise ValueError("Invalid distribution; dist must be 'norm', "
-                         "'expon', 'gumbel', 'extreme1' or 'logistic'.")
-    y = sort(x)
-    xbar = np.mean(x, axis=0)
-    N = len(y)
-    if dist == 'norm':
-        s = np.std(x, ddof=1, axis=0)
-        w = (y - xbar) / s
-        z = distributions.norm.cdf(w)
-        sig = array([15, 10, 5, 2.5, 1])
-        critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N), 3)
-    elif dist == 'expon':
-        w = y / xbar
-        z = distributions.expon.cdf(w)
-        sig = array([15, 10, 5, 2.5, 1])
-        critical = around(_Avals_expon / (1.0 + 0.6/N), 3)
-    elif dist == 'logistic':
-        def rootfunc(ab, xj, N):
-            a, b = ab
-            tmp = (xj - a) / b
-            tmp2 = exp(tmp)
-            val = [sum(1.0/(1 + tmp2), axis=0) - 0.5*N,
-                   sum(tmp*(1.0 - tmp2)/(1 + tmp2), axis=0) + N]
-            return array(val)
-        sol0 = array([xbar, np.std(x, ddof=1, axis=0)])
-        sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5)
-        w = (y - sol[0]) / sol[1]
-        z = distributions.logistic.cdf(w)
-        sig = array([25, 10, 5, 2.5, 1, 0.5])
-        critical = around(_Avals_logistic / (1.0 + 0.25/N), 3)
-    else:  # (dist == 'gumbel') or (dist == 'extreme1'):
-        # the following is incorrect, see ticket:1097
-        #def fixedsolve(th, xj, N):
-        #    val = stats.sum(xj)*1.0/N
-        #    tmp = exp(-xj/th)
-        #    term = sum(xj*tmp, axis=0)
-        #    term /= sum(tmp, axis=0)
-        #    return val - term
-        #s = optimize.fixed_point(fixedsolve, 1.0, args=(x, N), xtol=1e-5)
-        #xbar = -s*log(sum(exp(-x/s), axis=0)*1.0/N)
-        xbar, s = distributions.gumbel_l.fit(x)
-        w = (y - xbar) / s
-        z = distributions.gumbel_l.cdf(w)
-        sig = array([25, 10, 5, 2.5, 1])
-        critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
-
-    i = arange(1, N + 1)
-    S = sum((2*i - 1.0)/N*(log(z) + log(1 - z[::-1])), axis=0)
-    A2 = -N - S
-    return A2, critical, sig
-
-
-def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N):
-    """
-    Compute A2akN, equation 7 of Scholz and Stephens.
-
-    Parameters
-    ----------
-    samples : sequence of 1-D array_like
-        Array of sample arrays.
-    Z : array_like
-        Sorted array of all observations.
-    Zstar : array_like
-        Sorted array of unique observations.
-    k : int
-        Number of samples.
- n : array_like - Number of observations in each sample. - N : int - Total number of observations. - - Returns - ------- - A2aKN : float - The A2aKN statistics of Scholz and Stephens 1987. - """ - - A2akN = 0. - Z_ssorted_left = Z.searchsorted(Zstar, 'left') - if N == Zstar.size: - lj = 1. - else: - lj = Z.searchsorted(Zstar, 'right') - Z_ssorted_left - Bj = Z_ssorted_left + lj / 2. - for i in arange(0, k): - s = np.sort(samples[i]) - s_ssorted_right = s.searchsorted(Zstar, side='right') - Mij = s_ssorted_right.astype(np.float) - fij = s_ssorted_right - s.searchsorted(Zstar, 'left') - Mij -= fij / 2. - inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / \ - (Bj * (N - Bj) - N * lj / 4.) - A2akN += inner.sum() / n[i] - A2akN *= (N - 1.) / N - return A2akN - - -def _anderson_ksamp_right(samples, Z, Zstar, k, n, N): - """ - Compute A2akN equation 6 of Scholz & Stephens. - - Parameters - ---------- - samples : sequence of 1-D array_like - Array of sample arrays. - Z : array_like - Sorted array of all observations. - Zstar : array_like - Sorted array of unique observations. - k : int - Number of samples. - n : array_like - Number of observations in each sample. - N : int - Total number of observations. - - Returns - ------- - A2KN : float - The A2KN statistics of Scholz and Stephens 1987. - """ - - A2kN = 0. - lj = Z.searchsorted(Zstar[:-1], 'right') - Z.searchsorted(Zstar[:-1], - 'left') - Bj = lj.cumsum() - for i in arange(0, k): - s = np.sort(samples[i]) - Mij = s.searchsorted(Zstar[:-1], side='right') - inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / (Bj * (N - Bj)) - A2kN += inner.sum() / n[i] - return A2kN - - -def anderson_ksamp(samples, midrank=True): - """The Anderson-Darling test for k-samples. - - The k-sample Anderson-Darling test is a modification of the - one-sample Anderson-Darling test. It tests the null hypothesis - that k-samples are drawn from the same population without having - to specify the distribution function of that population. The - critical values depend on the number of samples. - - Parameters - ---------- - samples : sequence of 1-D array_like - Array of sample data in arrays. - midrank : bool, optional - Type of Anderson-Darling test which is computed. Default - (True) is the midrank test applicable to continuous and - discrete populations. If False, the right side empirical - distribution is used. - - Returns - ------- - A2 : float - Normalized k-sample Anderson-Darling test statistic. - critical : array - The critical values for significance levels 25%, 10%, 5%, 2.5%, 1%. - p : float - An approximate significance level at which the null hypothesis for the - provided samples can be rejected. - - Raises - ------ - ValueError - If less than 2 samples are provided, a sample is empty, or no - distinct observations are in the samples. - - See Also - -------- - ks_2samp : 2 sample Kolmogorov-Smirnov test - anderson : 1 sample Anderson-Darling test - - Notes - ----- - [1]_ Defines three versions of the k-sample Anderson-Darling test: - one for continuous distributions and two for discrete - distributions, in which ties between samples may occur. The - default of this routine is to compute the version based on the - midrank empirical distribution function. This test is applicable - to continuous and discrete data. If midrank is set to False, the - right side empirical distribution is used for a test for discrete - data. 
According to [1]_, the two discrete test statistics differ - only slightly if a few collisions due to round-off errors occur in - the test not adjusted for ties between samples. - - .. versionadded:: 0.14.0 - - References - ---------- - .. [1] Scholz, F. W and Stephens, M. A. (1987), K-Sample - Anderson-Darling Tests, Journal of the American Statistical - Association, Vol. 82, pp. 918-924. - - Examples - -------- - >>> from scipy import stats - >>> np.random.seed(314159) - - The null hypothesis that the two random samples come from the same - distribution can be rejected at the 5% level because the returned - test value is greater than the critical value for 5% (1.961) but - not at the 2.5% level. The interpolation gives an approximate - significance level of 3.1%: - - >>> stats.anderson_ksamp([np.random.normal(size=50), - ... np.random.normal(loc=0.5, size=30)]) - (2.4615796189876105, - array([ 0.325, 1.226, 1.961, 2.718, 3.752]), - 0.03134990135800783) - - - The null hypothesis cannot be rejected for three samples from an - identical distribution. The approximate p-value (87%) has to be - computed by extrapolation and may not be very accurate: - - >>> stats.anderson_ksamp([np.random.normal(size=50), - ... np.random.normal(size=30), np.random.normal(size=20)]) - (-0.73091722665244196, - array([ 0.44925884, 1.3052767 , 1.9434184 , 2.57696569, 3.41634856]), - 0.8789283903979661) - - """ - k = len(samples) - if (k < 2): - raise ValueError("anderson_ksamp needs at least two samples") - - samples = list(map(np.asarray, samples)) - Z = np.sort(np.hstack(samples)) - N = Z.size - Zstar = np.unique(Z) - if Zstar.size < 2: - raise ValueError("anderson_ksamp needs more than one distinct " - "observation") - - n = np.array([sample.size for sample in samples]) - if any(n == 0): - raise ValueError("anderson_ksamp encountered sample without " - "observations") - - if midrank: - A2kN = _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N) - else: - A2kN = _anderson_ksamp_right(samples, Z, Zstar, k, n, N) - - h = (1. / arange(1, N)).sum() - H = (1. / n).sum() - g = 0 - for l in arange(1, N-1): - inner = np.array([1. / ((N - l) * m) for m in arange(l+1, N)]) - g += inner.sum() - - a = (4*g - 6) * (k - 1) + (10 - 6*g)*H - b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6 - c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h - d = (2*h + 6)*k**2 - 4*h*k - sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.)) - m = k - 1 - A2 = (A2kN - m) / math.sqrt(sigmasq) - - # The b_i values are the interpolation coefficients from Table 2 - # of Scholz and Stephens 1987 - b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326]) - b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822]) - b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396]) - critical = b0 + b1 / math.sqrt(m) + b2 / m - pf = np.polyfit(critical, log(np.array([0.25, 0.1, 0.05, 0.025, 0.01])), 2) - if A2 < critical.min() or A2 > critical.max(): - warnings.warn("approximate p-value will be computed by extrapolation") - - p = math.exp(np.polyval(pf, A2)) - return A2, critical, p - - -def ansari(x,y): - """ - Perform the Ansari-Bradley test for equal scale parameters - - The Ansari-Bradley test is a non-parametric test for the equality - of the scale parameter of the distributions from which two - samples were drawn. 
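Editor's note: a minimal usage sketch, not part of the original docstring.
The same test is still available as ``scipy.stats.ansari``, so the call
pattern below can be checked against the upstream implementation:

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.RandomState(0)
    >>> x = rng.normal(scale=1.0, size=40)
    >>> y = rng.normal(scale=3.0, size=40)
    >>> AB, p = stats.ansari(x, y)  # a small p suggests unequal scales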
- - Parameters - ---------- - x, y : array_like - arrays of sample data - - Returns - ------- - AB : float - The Ansari-Bradley test statistic - p-value : float - The p-value of the hypothesis test - - See Also - -------- - fligner : A non-parametric test for the equality of k variances - mood : A non-parametric test for the equality of two scale parameters - - Notes - ----- - The p-value given is exact when the sample sizes are both less than - 55 and there are no ties, otherwise a normal approximation for the - p-value is used. - - References - ---------- - .. [1] Sprent, Peter and N.C. Smeeton. Applied nonparametric statistical - methods. 3rd ed. Chapman and Hall/CRC. 2001. Section 5.8.2. - - """ - x,y = asarray(x),asarray(y) - n = len(x) - m = len(y) - if m < 1: - raise ValueError("Not enough other observations.") - if n < 1: - raise ValueError("Not enough test observations.") - N = m+n - xy = r_[x,y] # combine - rank = stats.rankdata(xy) - symrank = amin(array((rank,N-rank+1)),0) - AB = sum(symrank[:n],axis=0) - uxy = unique(xy) - repeats = (len(uxy) != len(xy)) - exact = ((m < 55) and (n < 55) and not repeats) - if repeats and ((m < 55) or (n < 55)): - warnings.warn("Ties preclude use of exact statistic.") - if exact: - astart, a1, ifault = statlib.gscale(n,m) - ind = AB-astart - total = sum(a1,axis=0) - if ind < len(a1)/2.0: - cind = int(ceil(ind)) - if (ind == cind): - pval = 2.0*sum(a1[:cind+1],axis=0)/total - else: - pval = 2.0*sum(a1[:cind],axis=0)/total - else: - find = int(floor(ind)) - if (ind == floor(ind)): - pval = 2.0*sum(a1[find:],axis=0)/total - else: - pval = 2.0*sum(a1[find+1:],axis=0)/total - return AB, min(1.0,pval) - - # otherwise compute normal approximation - if N % 2: # N odd - mnAB = n*(N+1.0)**2 / 4.0 / N - varAB = n*m*(N+1.0)*(3+N**2)/(48.0*N**2) - else: - mnAB = n*(N+2.0)/4.0 - varAB = m*n*(N+2)*(N-2.0)/48/(N-1.0) - if repeats: # adjust variance estimates - # compute sum(tj * rj**2,axis=0) - fac = sum(symrank**2,axis=0) - if N % 2: # N odd - varAB = m*n*(16*N*fac-(N+1)**4)/(16.0 * N**2 * (N-1)) - else: # N even - varAB = m*n*(16*fac-N*(N+2)**2)/(16.0 * N * (N-1)) - z = (AB - mnAB)/sqrt(varAB) - pval = distributions.norm.sf(abs(z)) * 2.0 - return AB, pval - - -def bartlett(*args): - """ - Perform Bartlett's test for equal variances - - Bartlett's test tests the null hypothesis that all input samples - are from populations with equal variances. For samples - from significantly non-normal populations, Levene's test - `levene` is more robust. - - Parameters - ---------- - sample1, sample2,... : array_like - arrays of sample data. May be different lengths. - - Returns - ------- - T : float - The test statistic. - p-value : float - The p-value of the test. - - References - ---------- - .. [1] http://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm - - .. [2] Snedecor, George W. and Cochran, William G. (1989), Statistical - Methods, Eighth Edition, Iowa State University Press. 
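Editor's note: a minimal usage sketch, not part of the original docstring;
``scipy.stats.bartlett`` keeps this interface. The body below refers the
statistic T to a chi-square distribution with k - 1 degrees of freedom:

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.RandomState(1)
    >>> a = rng.normal(0.0, 1.0, size=50)
    >>> b = rng.normal(0.0, 1.0, size=60)
    >>> c = rng.normal(0.0, 2.5, size=55)
    >>> T, p = stats.bartlett(a, b, c)  # p small here: c's variance differs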
- - """ - k = len(args) - if k < 2: - raise ValueError("Must enter at least two input sample vectors.") - Ni = zeros(k) - ssq = zeros(k,'d') - for j in range(k): - Ni[j] = len(args[j]) - ssq[j] = np.var(args[j], ddof=1) - Ntot = sum(Ni,axis=0) - spsq = sum((Ni-1)*ssq,axis=0)/(1.0*(Ntot-k)) - numer = (Ntot*1.0-k)*log(spsq) - sum((Ni-1.0)*log(ssq),axis=0) - denom = 1.0 + (1.0/(3*(k-1)))*((sum(1.0/(Ni-1.0),axis=0))-1.0/(Ntot-k)) - T = numer / denom - pval = distributions.chi2.sf(T,k-1) # 1 - cdf - return T, pval - - -def levene(*args, **kwds): - """ - Perform Levene test for equal variances. - - The Levene test tests the null hypothesis that all input samples - are from populations with equal variances. Levene's test is an - alternative to Bartlett's test `bartlett` in the case where - there are significant deviations from normality. - - Parameters - ---------- - sample1, sample2, ... : array_like - The sample data, possibly with different lengths - center : {'mean', 'median', 'trimmed'}, optional - Which function of the data to use in the test. The default - is 'median'. - proportiontocut : float, optional - When `center` is 'trimmed', this gives the proportion of data points - to cut from each end. (See `scipy.stats.trim_mean`.) - Default is 0.05. - - Returns - ------- - W : float - The test statistic. - p-value : float - The p-value for the test. - - Notes - ----- - Three variations of Levene's test are possible. The possibilities - and their recommended usages are: - - * 'median' : Recommended for skewed (non-normal) distributions> - * 'mean' : Recommended for symmetric, moderate-tailed distributions. - * 'trimmed' : Recommended for heavy-tailed distributions. - - References - ---------- - .. [1] http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm - .. [2] Levene, H. (1960). In Contributions to Probability and Statistics: - Essays in Honor of Harold Hotelling, I. Olkin et al. eds., - Stanford University Press, pp. 278-292. - .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American - Statistical Association, 69, 364-367 - - """ - # Handle keyword arguments. - center = 'median' - proportiontocut = 0.05 - for kw, value in kwds.items(): - if kw not in ['center', 'proportiontocut']: - raise TypeError("levene() got an unexpected keyword " - "argument '%s'" % kw) - if kw == 'center': - center = value - else: - proportiontocut = value - - k = len(args) - if k < 2: - raise ValueError("Must enter at least two input sample vectors.") - Ni = zeros(k) - Yci = zeros(k, 'd') - - if center not in ['mean', 'median', 'trimmed']: - raise ValueError("Keyword argument
must be 'mean', 'median'" - + "or 'trimmed'.") - - if center == 'median': - func = lambda x: np.median(x, axis=0) - elif center == 'mean': - func = lambda x: np.mean(x, axis=0) - else: # center == 'trimmed' - args = tuple(stats.trimboth(np.sort(arg), proportiontocut) - for arg in args) - func = lambda x: np.mean(x, axis=0) - - for j in range(k): - Ni[j] = len(args[j]) - Yci[j] = func(args[j]) - Ntot = sum(Ni, axis=0) - - # compute Zij's - Zij = [None]*k - for i in range(k): - Zij[i] = abs(asarray(args[i])-Yci[i]) - # compute Zbari - Zbari = zeros(k, 'd') - Zbar = 0.0 - for i in range(k): - Zbari[i] = np.mean(Zij[i], axis=0) - Zbar += Zbari[i]*Ni[i] - Zbar /= Ntot - - numer = (Ntot-k) * sum(Ni*(Zbari-Zbar)**2, axis=0) - - # compute denom_variance - dvar = 0.0 - for i in range(k): - dvar += sum((Zij[i]-Zbari[i])**2, axis=0) - - denom = (k-1.0)*dvar - - W = numer / denom - pval = distributions.f.sf(W, k-1, Ntot-k) # 1 - cdf - return W, pval - - -@setastest(False) -def binom_test(x, n=None, p=0.5): - """ - Perform a test that the probability of success is p. - - This is an exact, two-sided test of the null hypothesis - that the probability of success in a Bernoulli experiment - is `p`. - - Parameters - ---------- - x : integer or array_like - the number of successes, or if x has length 2, it is the - number of successes and the number of failures. - n : integer - the number of trials. This is ignored if x gives both the - number of successes and failures - p : float, optional - The hypothesized probability of success. 0 <= p <= 1. The - default value is p = 0.5 - - Returns - ------- - p-value : float - The p-value of the hypothesis test - - References - ---------- - .. [1] http://en.wikipedia.org/wiki/Binomial_test - - """ - x = atleast_1d(x).astype(np.integer) - if len(x) == 2: - n = x[1]+x[0] - x = x[0] - elif len(x) == 1: - x = x[0] - if n is None or n < x: - raise ValueError("n must be >= x") - n = np.int_(n) - else: - raise ValueError("Incorrect length for x.") - - if (p > 1.0) or (p < 0.0): - raise ValueError("p must be in range [0,1]") - - d = distributions.binom.pmf(x, n, p) - rerr = 1+1e-7 - if (x == p*n): - # special case as shortcut, would also be handled by `else` below - pval = 1. - elif (x < p*n): - i = np.arange(np.ceil(p*n), n+1) - y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0) - pval = (distributions.binom.cdf(x, n, p) + - distributions.binom.sf(n-y, n, p)) - else: - i = np.arange(np.floor(p*n) + 1) - y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0) - pval = (distributions.binom.cdf(y-1, n, p) + - distributions.binom.sf(x-1, n, p)) - - return min(1.0, pval) - - -def _apply_func(x, g, func): - # g is list of indices into x - # separating x into different groups - # func should be applied over the groups - g = unique(r_[0, g, len(x)]) - output = [] - for k in range(len(g)-1): - output.append(func(x[g[k]:g[k+1]])) - return asarray(output) - - -def fligner(*args, **kwds): - """ - Perform Fligner's test for equal variances. - - Fligner's test tests the null hypothesis that all input samples - are from populations with equal variances. Fligner's test is - non-parametric in contrast to Bartlett's test `bartlett` and - Levene's test `levene`. - - Parameters - ---------- - sample1, sample2, ... : array_like - Arrays of sample data. Need not be the same length. - center : {'mean', 'median', 'trimmed'}, optional - Keyword argument controlling which function of the data is used in - computing the test statistic. The default is 'median'. 
-    proportiontocut : float, optional
-        When `center` is 'trimmed', this gives the proportion of data points
-        to cut from each end.  (See `scipy.stats.trim_mean`.)
-        Default is 0.05.
-
-    Returns
-    -------
-    Xsq : float
-        The test statistic.
-    p-value : float
-        The p-value for the hypothesis test.
-
-    Notes
-    -----
-    As with Levene's test there are three variants of Fligner's test that
-    differ by the measure of central tendency used in the test.  See `levene`
-    for more information.
-
-    References
-    ----------
-    .. [1] http://www.stat.psu.edu/~bgl/center/tr/TR993.ps
-
-    .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample
-           tests for scale. 'Journal of the American Statistical Association.'
-           71(353), 210-213.
-
-    """
-    # Handle keyword arguments.
-    center = 'median'
-    proportiontocut = 0.05
-    for kw, value in kwds.items():
-        if kw not in ['center', 'proportiontocut']:
-            raise TypeError("fligner() got an unexpected keyword "
-                            "argument '%s'" % kw)
-        if kw == 'center':
-            center = value
-        else:
-            proportiontocut = value
-
-    k = len(args)
-    if k < 2:
-        raise ValueError("Must enter at least two input sample vectors.")
-
-    if center not in ['mean', 'median', 'trimmed']:
-        raise ValueError("Keyword argument <center> must be 'mean', 'median'"
-                         + " or 'trimmed'.")
-
-    if center == 'median':
-        func = lambda x: np.median(x, axis=0)
-    elif center == 'mean':
-        func = lambda x: np.mean(x, axis=0)
-    else:  # center == 'trimmed'
-        args = tuple(stats.trimboth(arg, proportiontocut) for arg in args)
-        func = lambda x: np.mean(x, axis=0)
-
-    Ni = asarray([len(args[j]) for j in range(k)])
-    Yci = asarray([func(args[j]) for j in range(k)])
-    Ntot = sum(Ni, axis=0)
-    # compute Zij's
-    Zij = [abs(asarray(args[i]) - Yci[i]) for i in range(k)]
-    allZij = []
-    g = [0]
-    for i in range(k):
-        allZij.extend(list(Zij[i]))
-        g.append(len(allZij))
-
-    ranks = stats.rankdata(allZij)
-    a = distributions.norm.ppf(ranks/(2*(Ntot + 1.0)) + 0.5)
-
-    # compute Aibar
-    Aibar = _apply_func(a, g, sum) / Ni
-    anbar = np.mean(a, axis=0)
-    varsq = np.var(a, axis=0, ddof=1)
-    Xsq = sum(Ni*(asarray(Aibar) - anbar)**2.0, axis=0)/varsq
-    pval = distributions.chi2.sf(Xsq, k - 1)  # 1 - cdf
-    return Xsq, pval
-
-
-def mood(x, y, axis=0):
-    """
-    Perform Mood's test for equal scale parameters.
-
-    Mood's two-sample test for scale parameters is a non-parametric
-    test for the null hypothesis that two samples are drawn from the
-    same distribution with the same scale parameter.
-
-    Parameters
-    ----------
-    x, y : array_like
-        Arrays of sample data.
-    axis : int, optional
-        The axis along which the samples are tested.  `x` and `y` can be of
-        different length along `axis`.
-        If `axis` is None, `x` and `y` are flattened and the test is done on
-        all values in the flattened arrays.
-
-    Returns
-    -------
-    z : scalar or ndarray
-        The z-score for the hypothesis test.  For 1-D inputs a scalar is
-        returned.
-    p-value : scalar or ndarray
-        The p-value for the hypothesis test.
-
-    See Also
-    --------
-    fligner : A non-parametric test for the equality of k variances
-    ansari : A non-parametric test for the equality of 2 variances
-    bartlett : A parametric test for equality of k variances in normal samples
-    levene : A parametric test for equality of k variances
-
-    Notes
-    -----
-    The data are assumed to be drawn from probability distributions ``f(x)``
-    and ``f(x/s) / s`` respectively, for some probability density function f.
-    The null hypothesis is that ``s == 1``.
-
-    For multi-dimensional arrays, if the inputs are of shapes
-    ``(n0, n1, n2, n3)`` and ``(n0, m1, n2, n3)``, then if ``axis=1``, the
-    resulting z and p values will have shape ``(n0, n2, n3)``.  Note that
-    ``n1`` and ``m1`` don't have to be equal, but the other dimensions do.
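Editor's note: a condensed NumPy transcription of the statistic computed by
the deleted ``fligner`` body above, for reference. This is a sketch under the
default ``center='median'``, and ``fligner_stat`` is a hypothetical name, not
part of the module:

    import numpy as np
    from scipy.stats import rankdata, norm, chi2

    def fligner_stat(*samples):
        # Normal scores of the ranks of |x - group median|, compared
        # across the k groups with a chi-square on k - 1 dof.
        k = len(samples)
        Zij = [np.abs(np.asarray(s) - np.median(s)) for s in samples]
        Ni = np.array([len(z) for z in Zij])
        Ntot = Ni.sum()
        a = norm.ppf(rankdata(np.concatenate(Zij)) / (2.0*(Ntot + 1.0)) + 0.5)
        bounds = np.r_[0, Ni.cumsum()]
        Aibar = np.array([a[bounds[i]:bounds[i+1]].mean() for i in range(k)])
        Xsq = (Ni * (Aibar - a.mean())**2).sum() / a.var(ddof=1)
        return Xsq, chi2.sf(Xsq, k - 1)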
- - Examples - -------- - >>> from scipy import stats - >>> x2 = np.random.randn(2, 45, 6, 7) - >>> x1 = np.random.randn(2, 30, 6, 7) - >>> z, p = stats.mood(x1, x2, axis=1) - >>> p.shape - (2, 6, 7) - - Find the number of points where the difference in scale is not significant: - - >>> (p > 0.1).sum() - 74 - - Perform the test with different scales: - - >>> x1 = np.random.randn(2, 30) - >>> x2 = np.random.randn(2, 35) * 10.0 - >>> stats.mood(x1, x2, axis=1) - (array([-5.84332354, -5.6840814 ]), array([5.11694980e-09, 1.31517628e-08])) - - """ - x = np.asarray(x, dtype=float) - y = np.asarray(y, dtype=float) - - if axis is None: - x = x.flatten() - y = y.flatten() - axis = 0 - - # Determine shape of the result arrays - res_shape = tuple([x.shape[ax] for ax in range(len(x.shape)) if ax != axis]) - if not (res_shape == tuple([y.shape[ax] for ax in range(len(y.shape)) if - ax != axis])): - raise ValueError("Dimensions of x and y on all axes except `axis` " - "should match") - - n = x.shape[axis] - m = y.shape[axis] - N = m + n - if N < 3: - raise ValueError("Not enough observations.") - - xy = np.concatenate((x, y), axis=axis) - if axis != 0: - xy = np.rollaxis(xy, axis) - - xy = xy.reshape(xy.shape[0], -1) - - # Generalized to the n-dimensional case by adding the axis argument, and - # using for loops, since rankdata is not vectorized. For improving - # performance consider vectorizing rankdata function. - all_ranks = np.zeros_like(xy) - for j in range(xy.shape[1]): - all_ranks[:, j] = stats.rankdata(xy[:, j]) - - Ri = all_ranks[:n] - M = sum((Ri - (N + 1.0) / 2) ** 2, axis=0) - # Approx stat. - mnM = n * (N * N - 1.0) / 12 - varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180 - z = (M - mnM) / sqrt(varM) - - # sf for right tail, cdf for left tail. Factor 2 for two-sidedness - z_pos = z > 0 - pval = np.zeros_like(z) - pval[z_pos] = 2 * distributions.norm.sf(z[z_pos]) - pval[~z_pos] = 2 * distributions.norm.cdf(z[~z_pos]) - - if res_shape == (): - # Return scalars, not 0-D arrays - z = z[0] - pval = pval[0] - else: - z.shape = res_shape - pval.shape = res_shape - - return z, pval - - -def wilcoxon(x, y=None, zero_method="wilcox", correction=False): - """ - Calculate the Wilcoxon signed-rank test. - - The Wilcoxon signed-rank test tests the null hypothesis that two - related paired samples come from the same distribution. In particular, - it tests whether the distribution of the differences x - y is symmetric - about zero. It is a non-parametric version of the paired T-test. - - Parameters - ---------- - x : array_like - The first set of measurements. - y : array_like, optional - The second set of measurements. If `y` is not given, then the `x` - array is considered to be the differences between the two sets of - measurements. - zero_method : string, {"pratt", "wilcox", "zsplit"}, optional - "pratt": - Pratt treatment: includes zero-differences in the ranking process - (more conservative) - "wilcox": - Wilcox treatment: discards all zero-differences - "zsplit": - Zero rank split: just like Pratt, but spliting the zero rank - between positive and negative ones - correction : bool, optional - If True, apply continuity correction by adjusting the Wilcoxon rank - statistic by 0.5 towards the mean value when computing the - z-statistic. Default is False. - - Returns - ------- - T : float - The sum of the ranks of the differences above or below zero, whichever - is smaller. - p-value : float - The two-sided p-value for the test. 
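Editor's note: a minimal usage sketch for paired data, not part of the
original docstring; ``scipy.stats.wilcoxon`` keeps this signature:

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.RandomState(42)
    >>> before = rng.normal(10.0, 2.0, size=30)
    >>> after = before + rng.normal(0.5, 1.0, size=30)
    >>> T, p = stats.wilcoxon(before, after)  # tests symmetry of the
    ...                                       # differences about zero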
- - Notes - ----- - Because the normal approximation is used for the calculations, the - samples used should be large. A typical rule is to require that - n > 20. - - References - ---------- - .. [1] http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test - - """ - - if zero_method not in ["wilcox", "pratt", "zsplit"]: - raise ValueError("Zero method should be either 'wilcox' \ - or 'pratt' or 'zsplit'") - - if y is None: - d = x - else: - x, y = map(asarray, (x, y)) - if len(x) != len(y): - raise ValueError('Unequal N in wilcoxon. Aborting.') - d = x-y - - if zero_method == "wilcox": - d = compress(not_equal(d, 0), d, axis=-1) # Keep all non-zero differences - - count = len(d) - if (count < 10): - warnings.warn("Warning: sample size too small for normal approximation.") - r = stats.rankdata(abs(d)) - r_plus = sum((d > 0) * r, axis=0) - r_minus = sum((d < 0) * r, axis=0) - - if zero_method == "zsplit": - r_zero = sum((d == 0) * r, axis=0) - r_plus += r_zero / 2. - r_minus += r_zero / 2. - - T = min(r_plus, r_minus) - mn = count*(count + 1.) * 0.25 - se = count*(count + 1.) * (2. * count + 1.) - - if zero_method == "pratt": - r = r[d != 0] - - replist, repnum = find_repeats(r) - if repnum.size != 0: - # Correction for repeated elements. - se -= 0.5 * (repnum * (repnum * repnum - 1)).sum() - - se = sqrt(se / 24) - correction = 0.5 * int(bool(correction)) * np.sign(T - mn) - z = (T - mn - correction) / se - prob = 2. * distributions.norm.sf(abs(z)) - return T, prob - - -@setastest(False) -def median_test(*args, **kwds): - """ - Mood's median test. - - Test that two or more samples come from populations with the same median. - - Let ``n = len(args)`` be the number of samples. The "grand median" of - all the data is computed, and a contingency table is formed by - classifying the values in each sample as being above or below the grand - median. The contingency table, along with `correction` and `lambda_`, - are passed to `scipy.stats.chi2_contingency` to compute the test statistic - and p-value. - - Parameters - ---------- - sample1, sample2, ... : array_like - The set of samples. There must be at least two samples. - Each sample must be a one-dimensional sequence containing at least - one value. The samples are not required to have the same length. - ties : str, optional - Determines how values equal to the grand median are classified in - the contingency table. The string must be one of:: - - "below": - Values equal to the grand median are counted as "below". - "above": - Values equal to the grand median are counted as "above". - "ignore": - Values equal to the grand median are not counted. - - The default is "below". - correction : bool, optional - If True, *and* there are just two samples, apply Yates' correction - for continuity when computing the test statistic associated with - the contingency table. Default is True. - lambda_ : float or str, optional. - By default, the statistic computed in this test is Pearson's - chi-squared statistic. `lambda_` allows a statistic from the - Cressie-Read power divergence family to be used instead. See - `power_divergence` for details. - Default is 1 (Pearson's chi-squared statistic). - - Returns - ------- - stat : float - The test statistic. The statistic that is returned is determined by - `lambda_`. The default is Pearson's chi-squared statistic. - p : float - The p-value of the test. - m : float - The grand median. - table : ndarray - The contingency table. The shape of the table is (2, n), where - n is the number of samples. 
The first row holds the counts of the - values above the grand median, and the second row holds the counts - of the values below the grand median. The table allows further - analysis with, for example, `scipy.stats.chi2_contingency`, or with - `scipy.stats.fisher_exact` if there are two samples, without having - to recompute the table. - - See Also - -------- - kruskal : Compute the Kruskal-Wallis H-test for independent samples. - mannwhitneyu : Computes the Mann-Whitney rank test on samples x and y. - - Notes - ----- - .. versionadded:: 0.15.0 - - References - ---------- - .. [1] Mood, A. M., Introduction to the Theory of Statistics. McGraw-Hill - (1950), pp. 394-399. - .. [2] Zar, J. H., Biostatistical Analysis, 5th ed. Prentice Hall (2010). - See Sections 8.12 and 10.15. - - Examples - -------- - A biologist runs an experiment in which there are three groups of plants. - Group 1 has 16 plants, group 2 has 15 plants, and group 3 has 17 plants. - Each plant produces a number of seeds. The seed counts for each group - are:: - - Group 1: 10 14 14 18 20 22 24 25 31 31 32 39 43 43 48 49 - Group 2: 28 30 31 33 34 35 36 40 44 55 57 61 91 92 99 - Group 3: 0 3 9 22 23 25 25 33 34 34 40 45 46 48 62 67 84 - - The following code applies Mood's median test to these samples. - - >>> g1 = [10, 14, 14, 18, 20, 22, 24, 25, 31, 31, 32, 39, 43, 43, 48, 49] - >>> g2 = [28, 30, 31, 33, 34, 35, 36, 40, 44, 55, 57, 61, 91, 92, 99] - >>> g3 = [0, 3, 9, 22, 23, 25, 25, 33, 34, 34, 40, 45, 46, 48, 62, 67, 84] - >>> stat, p, med, tbl = median_test(g1, g2, g3) - - The median is - - >>> med - 34.0 - - and the contingency table is - - >>> tbl - array([[ 5, 10, 7], - [11, 5, 10]]) - - `p` is too large to conclude that the medians are not the same: - - >>> p - 0.12609082774093244 - - The "G-test" can be performed by passing ``lambda_="log-likelihood"`` to - `median_test`. - - >>> g, p, med, tbl = median_test(g1, g2, g3, lambda_="log-likelihood") - >>> p - 0.12224779737117837 - - The median occurs several times in the data, so we'll get a different - result if, for example, ``ties="above"`` is used: - - >>> stat, p, med, tbl = median_test(g1, g2, g3, ties="above") - >>> p - 0.063873276069553273 - - >>> tbl - array([[ 5, 11, 9], - [11, 4, 8]]) - - This example demonstrates that if the data set is not large and there - are values equal to the median, the p-value can be sensitive to the - choice of `ties`. - - """ - ties = kwds.pop('ties', 'below') - correction = kwds.pop('correction', True) - lambda_ = kwds.pop('lambda_', None) - - if len(kwds) > 0: - bad_kwd = kwds.keys()[0] - raise TypeError("median_test() got an unexpected keyword " - "argument %r" % bad_kwd) - - if len(args) < 2: - raise ValueError('median_test requires two or more samples.') - - ties_options = ['below', 'above', 'ignore'] - if ties not in ties_options: - raise ValueError("invalid 'ties' option '%s'; 'ties' must be one " - "of: %s" % (ties, str(ties_options)[1:-1])) - - data = [np.asarray(arg) for arg in args] - - # Validate the sizes and shapes of the arguments. - for k, d in enumerate(data): - if d.size == 0: - raise ValueError("Sample %d is empty. All samples must " - "contain at least one value." % (k + 1)) - if d.ndim != 1: - raise ValueError("Sample %d has %d dimensions. All " - "samples must be one-dimensional sequences." % - (k + 1, d.ndim)) - - grand_median = np.median(np.concatenate(data)) - - # Create the contingency table. 
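-    # (Editor's note, added for clarity: the table is 2 x len(data);
-    # row 0 counts values above the grand median, row 1 values below,
-    # with ties routed according to the `ties` option documented above.)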
- table = np.zeros((2, len(data)), dtype=np.int64) - for k, sample in enumerate(data): - nabove = count_nonzero(sample > grand_median) - nbelow = count_nonzero(sample < grand_median) - nequal = sample.size - (nabove + nbelow) - table[0, k] += nabove - table[1, k] += nbelow - if ties == "below": - table[1, k] += nequal - elif ties == "above": - table[0, k] += nequal - - # Check that no row or column of the table is all zero. - # Such a table can not be given to chi2_contingency, because it would have - # a zero in the table of expected frequencies. - rowsums = table.sum(axis=1) - if rowsums[0] == 0: - raise ValueError("All values are below the grand median (%r)." % - grand_median) - if rowsums[1] == 0: - raise ValueError("All values are above the grand median (%r)." % - grand_median) - if ties == "ignore": - # We already checked that each sample has at least one value, but it - # is possible that all those values equal the grand median. If `ties` - # is "ignore", that would result in a column of zeros in `table`. We - # check for that case here. - zero_cols = np.where((table == 0).all(axis=0))[0] - if len(zero_cols) > 0: - msg = ("All values in sample %d are equal to the grand " - "median (%r), so they are ignored, resulting in an " - "empty sample." % (zero_cols[0] + 1, grand_median)) - raise ValueError(msg) - - stat, p, dof, expected = chi2_contingency(table, lambda_=lambda_, - correction=correction) - return stat, p, grand_median, table - - -def _hermnorm(N): - # return the negatively normalized hermite polynomials up to order N-1 - # (inclusive) - # using the recursive relationship - # p_n+1 = p_n(x)' - x*p_n(x) - # and p_0(x) = 1 - plist = [None]*N - plist[0] = poly1d(1) - for n in range(1,N): - plist[n] = plist[n-1].deriv() - poly1d([1,0])*plist[n-1] - return plist - - -def pdf_fromgamma(g1, g2, g3=0.0, g4=None): - if g4 is None: - g4 = 3*g2*g2 - sigsq = 1.0/g2 - sig = sqrt(sigsq) - mu = g1*sig**3.0 - p12 = _hermnorm(13) - for k in range(13): - p12[k] = p12[k]/sig**k - - # Add all of the terms to polynomial - totp = p12[0] - (g1/6.0*p12[3]) + \ - (g2/24.0*p12[4] + g1*g1/72.0*p12[6]) - \ - (g3/120.0*p12[5] + g1*g2/144.0*p12[7] + g1**3.0/1296.0*p12[9]) + \ - (g4/720*p12[6] + (g2*g2/1152.0+g1*g3/720)*p12[8] + - g1*g1*g2/1728.0*p12[10] + g1**4.0/31104.0*p12[12]) - # Final normalization - totp = totp / sqrt(2*pi)/sig - - def thefunc(x): - xn = (x-mu)/sig - return totp(xn)*exp(-xn*xn/2.0) - return thefunc - - -def _circfuncs_common(samples, high, low): - samples = np.asarray(samples) - if samples.size == 0: - return np.nan, np.nan - - ang = (samples - low)*2*pi / (high-low) - return samples, ang - - -def circmean(samples, high=2*pi, low=0, axis=None): - """ - Compute the circular mean for samples in a range. - - Parameters - ---------- - samples : array_like - Input array. - high : float or int, optional - High boundary for circular mean range. Default is ``2*pi``. - low : float or int, optional - Low boundary for circular mean range. Default is 0. - axis : int, optional - Axis along which means are computed. The default is to compute - the mean of the flattened array. - - Returns - ------- - circmean : float - Circular mean. 
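Editor's note: an illustrative example, not in the original docstring. The
circular mean respects wrap-around where the arithmetic mean does not:

    >>> import numpy as np
    >>> from scipy.stats import circmean
    >>> circmean([0.1, 2*np.pi - 0.1])  # close to 0 (mod 2*pi); np.mean gives ~pi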
- - """ - samples, ang = _circfuncs_common(samples, high, low) - res = angle(np.mean(exp(1j*ang), axis=axis)) - mask = res < 0 - if (mask.ndim > 0): - res[mask] += 2*pi - elif mask: - res = res + 2*pi - - return res*(high-low)/2.0/pi + low - - -def circvar(samples, high=2*pi, low=0, axis=None): - """ - Compute the circular variance for samples assumed to be in a range - - Parameters - ---------- - samples : array_like - Input array. - low : float or int, optional - Low boundary for circular variance range. Default is 0. - high : float or int, optional - High boundary for circular variance range. Default is ``2*pi``. - axis : int, optional - Axis along which variances are computed. The default is to compute - the variance of the flattened array. - - Returns - ------- - circvar : float - Circular variance. - - Notes - ----- - This uses a definition of circular variance that in the limit of small - angles returns a number close to the 'linear' variance. - - """ - samples, ang = _circfuncs_common(samples, high, low) - res = np.mean(exp(1j*ang), axis=axis) - R = abs(res) - return ((high-low)/2.0/pi)**2 * 2 * log(1/R) - - -def circstd(samples, high=2*pi, low=0, axis=None): - """ - Compute the circular standard deviation for samples assumed to be in the - range [low to high]. - - Parameters - ---------- - samples : array_like - Input array. - low : float or int, optional - Low boundary for circular standard deviation range. Default is 0. - high : float or int, optional - High boundary for circular standard deviation range. - Default is ``2*pi``. - axis : int, optional - Axis along which standard deviations are computed. The default is - to compute the standard deviation of the flattened array. - - Returns - ------- - circstd : float - Circular standard deviation. - - Notes - ----- - This uses a definition of circular standard deviation that in the limit of - small angles returns a number close to the 'linear' standard deviation. - - """ - samples, ang = _circfuncs_common(samples, high, low) - res = np.mean(exp(1j*ang), axis=axis) - R = abs(res) - return ((high-low)/2.0/pi) * sqrt(-2*log(R)) - - -# Tests to include (from R) -- some of these already in stats. -######## -# X Ansari-Bradley -# X Bartlett (and Levene) -# X Binomial -# Y Pearson's Chi-squared (stats.chisquare) -# Y Association Between Paired samples (stats.pearsonr, stats.spearmanr) -# stats.kendalltau) -- these need work though -# Fisher's exact test -# X Fligner-Killeen Test -# Y Friedman Rank Sum (stats.friedmanchisquare?) -# Y Kruskal-Wallis -# Y Kolmogorov-Smirnov -# Cochran-Mantel-Haenszel Chi-Squared for Count -# McNemar's Chi-squared for Count -# X Mood Two-Sample -# X Test For Equal Means in One-Way Layout (see stats.ttest also) -# Pairwise Comparisons of proportions -# Pairwise t tests -# Tabulate p values for pairwise comparisons -# Pairwise Wilcoxon rank sum tests -# Power calculations two sample test of prop. -# Power calculations for one and two sample t tests -# Equal or Given Proportions -# Trend in Proportions -# Quade Test -# Y Student's T Test -# Y F Test to compare two variances -# XY Wilcoxon Rank Sum and Signed Rank Tests diff --git a/wafo/stats/mstats.py b/wafo/stats/mstats.py deleted file mode 100644 index c5b62cc..0000000 --- a/wafo/stats/mstats.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -=================================================================== -Statistical functions for masked arrays (:mod:`scipy.stats.mstats`) -=================================================================== - -.. 
currentmodule:: scipy.stats.mstats - -This module contains a large number of statistical functions that can -be used with masked arrays. - -Most of these functions are similar to those in scipy.stats but might -have small differences in the API or in the algorithm used. Since this -is a relatively new package, some API changes are still possible. - -.. autosummary:: - :toctree: generated/ - - argstoarray - betai - chisquare - count_tied_groups - describe - f_oneway - f_value_wilks_lambda - find_repeats - friedmanchisquare - kendalltau - kendalltau_seasonal - kruskalwallis - ks_twosamp - kurtosis - kurtosistest - linregress - mannwhitneyu - plotting_positions - mode - moment - mquantiles - msign - normaltest - obrientransform - pearsonr - plotting_positions - pointbiserialr - rankdata - scoreatpercentile - sem - signaltonoise - skew - skewtest - spearmanr - theilslopes - threshold - tmax - tmean - tmin - trim - trima - trimboth - trimmed_stde - trimr - trimtail - tsem - ttest_onesamp - ttest_ind - ttest_onesamp - ttest_rel - tvar - variation - winsorize - zmap - zscore - -""" -from __future__ import division, print_function, absolute_import - -from .mstats_basic import * -from .mstats_extras import * -from scipy.stats import gmean, hmean diff --git a/wafo/stats/mstats_basic.py b/wafo/stats/mstats_basic.py deleted file mode 100644 index 35de9e1..0000000 --- a/wafo/stats/mstats_basic.py +++ /dev/null @@ -1,2027 +0,0 @@ -""" -An extension of scipy.stats.stats to support masked arrays - -""" -# Original author (2007): Pierre GF Gerard-Marchant - -# TODO : f_value_wilks_lambda looks botched... what are dfnum & dfden for ? -# TODO : ttest_rel looks botched: what are x1,x2,v1,v2 for ? -# TODO : reimplement ksonesamp - -from __future__ import division, print_function, absolute_import - - -__all__ = ['argstoarray', - 'betai', - 'chisquare','count_tied_groups', - 'describe', - 'f_oneway','f_value_wilks_lambda','find_repeats','friedmanchisquare', - 'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis', - 'ks_twosamp','ks_2samp','kurtosis','kurtosistest', - 'linregress', - 'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign', - 'normaltest', - 'obrientransform', - 'pearsonr','plotting_positions','pointbiserialr', - 'rankdata', - 'scoreatpercentile','sem', - 'sen_seasonal_slopes','signaltonoise','skew','skewtest','spearmanr', - 'theilslopes','threshold','tmax','tmean','tmin','trim','trimboth', - 'trimtail','trima','trimr','trimmed_mean','trimmed_std', - 'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp', - 'ttest_ind','ttest_rel','tvar', - 'variation', - 'winsorize', - 'zmap', 'zscore' - ] - -import numpy as np -from numpy import ndarray -import numpy.ma as ma -from numpy.ma import masked, nomask - -from scipy._lib.six import iteritems - -import itertools -import warnings - -from . import stats -from . import distributions -import scipy.special as special -from . import futil - - -genmissingvaldoc = """ - - Notes - ----- - Missing values are considered pair-wise: if a value is missing in x, - the corresponding value in y is masked. 
- """ - - -def _chk_asarray(a, axis): - # Always returns a masked array, raveled for axis=None - a = ma.asanyarray(a) - if axis is None: - a = ma.ravel(a) - outaxis = 0 - else: - outaxis = axis - return a, outaxis - - -def _chk2_asarray(a, b, axis): - a = ma.asanyarray(a) - b = ma.asanyarray(b) - if axis is None: - a = ma.ravel(a) - b = ma.ravel(b) - outaxis = 0 - else: - outaxis = axis - return a, b, outaxis - - -def _chk_size(a,b): - a = ma.asanyarray(a) - b = ma.asanyarray(b) - (na, nb) = (a.size, b.size) - if na != nb: - raise ValueError("The size of the input array should match!" - " (%s <> %s)" % (na, nb)) - return (a, b, na) - - -def argstoarray(*args): - """ - Constructs a 2D array from a group of sequences. - - Sequences are filled with missing values to match the length of the longest - sequence. - - Parameters - ---------- - args : sequences - Group of sequences. - - Returns - ------- - argstoarray : MaskedArray - A ( `m` x `n` ) masked array, where `m` is the number of arguments and - `n` the length of the longest argument. - - Notes - ----- - `numpy.ma.row_stack` has identical behavior, but is called with a sequence - of sequences. - - """ - if len(args) == 1 and not isinstance(args[0], ndarray): - output = ma.asarray(args[0]) - if output.ndim != 2: - raise ValueError("The input should be 2D") - else: - n = len(args) - m = max([len(k) for k in args]) - output = ma.array(np.empty((n,m), dtype=float), mask=True) - for (k,v) in enumerate(args): - output[k,:len(v)] = v - - output[np.logical_not(np.isfinite(output._data))] = masked - return output - - -def find_repeats(arr): - """Find repeats in arr and return a tuple (repeats, repeat_count). - Masked values are discarded. - - Parameters - ---------- - arr : sequence - Input array. The array is flattened if it is not 1D. - - Returns - ------- - repeats : ndarray - Array of repeated values. - counts : ndarray - Array of counts. - - """ - marr = ma.compressed(arr) - if not marr.size: - return (np.array(0), np.array(0)) - - (v1, v2, n) = futil.dfreps(ma.array(ma.compressed(arr), copy=True)) - return (v1[:n], v2[:n]) - - -def count_tied_groups(x, use_missing=False): - """ - Counts the number of tied values. - - Parameters - ---------- - x : sequence - Sequence of data on which to counts the ties - use_missing : boolean - Whether to consider missing values as tied. - - Returns - ------- - count_tied_groups : dict - Returns a dictionary (nb of ties: nb of groups). - - Examples - -------- - >>> from scipy.stats import mstats - >>> z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6] - >>> mstats.count_tied_groups(z) - {2: 1, 3: 2} - - In the above example, the ties were 0 (3x), 2 (3x) and 3 (2x). - - >>> z = np.ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6]) - >>> mstats.count_tied_groups(z) - {2: 2, 3: 1} - >>> z[[1,-1]] = np.ma.masked - >>> mstats.count_tied_groups(z, use_missing=True) - {2: 2, 3: 1} - - """ - nmasked = ma.getmask(x).sum() - # We need the copy as find_repeats will overwrite the initial data - data = ma.compressed(x).copy() - (ties, counts) = find_repeats(data) - nties = {} - if len(ties): - nties = dict(zip(np.unique(counts), itertools.repeat(1))) - nties.update(dict(zip(*find_repeats(counts)))) - - if nmasked and use_missing: - try: - nties[nmasked] += 1 - except KeyError: - nties[nmasked] = 1 - - return nties - - -def rankdata(data, axis=None, use_missing=False): - """Returns the rank (also known as order statistics) of each data point - along the given axis. - - If some values are tied, their rank is averaged. 
- If some values are masked, their rank is set to 0 if use_missing is False, - or set to the average rank of the unmasked values if use_missing is True. - - Parameters - ---------- - data : sequence - Input data. The data is transformed to a masked array - axis : {None,int}, optional - Axis along which to perform the ranking. - If None, the array is first flattened. An exception is raised if - the axis is specified for arrays with a dimension larger than 2 - use_missing : {boolean}, optional - Whether the masked values have a rank of 0 (False) or equal to the - average rank of the unmasked values (True). - - """ - def _rank1d(data, use_missing=False): - n = data.count() - rk = np.empty(data.size, dtype=float) - idx = data.argsort() - rk[idx[:n]] = np.arange(1,n+1) - - if use_missing: - rk[idx[n:]] = (n+1)/2. - else: - rk[idx[n:]] = 0 - - repeats = find_repeats(data.copy()) - for r in repeats[0]: - condition = (data == r).filled(False) - rk[condition] = rk[condition].mean() - return rk - - data = ma.array(data, copy=False) - if axis is None: - if data.ndim > 1: - return _rank1d(data.ravel(), use_missing).reshape(data.shape) - else: - return _rank1d(data, use_missing) - else: - return ma.apply_along_axis(_rank1d,axis,data,use_missing).view(ndarray) - - -def mode(a, axis=0): - a, axis = _chk_asarray(a, axis) - - def _mode1D(a): - (rep,cnt) = find_repeats(a) - if not cnt.ndim: - return (0, 0) - elif cnt.size: - return (rep[cnt.argmax()], cnt.max()) - else: - not_masked_indices = ma.flatnotmasked_edges(a) - first_not_masked_index = not_masked_indices[0] - return (a[first_not_masked_index], 1) - - if axis is None: - output = _mode1D(ma.ravel(a)) - output = (ma.array(output[0]), ma.array(output[1])) - else: - output = ma.apply_along_axis(_mode1D, axis, a) - newshape = list(a.shape) - newshape[axis] = 1 - slices = [slice(None)] * output.ndim - slices[axis] = 0 - modes = output[tuple(slices)].reshape(newshape) - slices[axis] = 1 - counts = output[tuple(slices)].reshape(newshape) - output = (modes, counts) - return output -mode.__doc__ = stats.mode.__doc__ - - -def betai(a, b, x): - x = np.asanyarray(x) - x = ma.where(x < 1.0, x, 1.0) # if x > 1 then return 1.0 - return special.betainc(a, b, x) -betai.__doc__ = stats.betai.__doc__ - - -def msign(x): - """Returns the sign of x, or 0 if x is masked.""" - return ma.filled(np.sign(x), 0) - - -def pearsonr(x,y): - """ - Calculates a Pearson correlation coefficient and the p-value for testing - non-correlation. - - The Pearson correlation coefficient measures the linear relationship - between two datasets. Strictly speaking, Pearson's correlation requires - that each dataset be normally distributed. Like other correlation - coefficients, this one varies between -1 and +1 with 0 implying no - correlation. Correlations of -1 or +1 imply an exact linear - relationship. Positive correlations imply that as `x` increases, so does - `y`. Negative correlations imply that as `x` increases, `y` decreases. - - The p-value roughly indicates the probability of an uncorrelated system - producing datasets that have a Pearson correlation at least as extreme - as the one computed from these datasets. The p-values are not entirely - reliable but are probably reasonable for datasets larger than 500 or so. - - Parameters - ---------- - x : 1-D array_like - Input - y : 1-D array_like - Input - - Returns - ------- - pearsonr : float - Pearson's correlation coefficient, 2-tailed p-value. 
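-
-    A short usage sketch (hypothetical data; the masked pair is dropped
-    from the computation):
-
-    >>> from scipy.stats import mstats
-    >>> x = np.ma.array([1., 2., 3., 4.], mask=[0, 0, 0, 1])
-    >>> y = np.ma.array([1., 2., 3., 9.])
-    >>> r, p = mstats.pearsonr(x, y)  # computed from the 3 unmasked pairs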
- - References - ---------- - http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation - - """ - (x, y, n) = _chk_size(x, y) - (x, y) = (x.ravel(), y.ravel()) - # Get the common mask and the total nb of unmasked elements - m = ma.mask_or(ma.getmask(x), ma.getmask(y)) - n -= m.sum() - df = n-2 - if df < 0: - return (masked, masked) - - (mx, my) = (x.mean(), y.mean()) - (xm, ym) = (x-mx, y-my) - - r_num = ma.add.reduce(xm*ym) - r_den = ma.sqrt(ma.dot(xm,xm) * ma.dot(ym,ym)) - r = r_num / r_den - # Presumably, if r > 1, then it is only some small artifact of floating - # point arithmetic. - r = min(r, 1.0) - r = max(r, -1.0) - df = n - 2 - - if r is masked or abs(r) == 1.0: - prob = 0. - else: - t_squared = (df / ((1.0 - r) * (1.0 + r))) * r * r - prob = betai(0.5*df, 0.5, df/(df + t_squared)) - - return r, prob - - -def spearmanr(x, y, use_ties=True): - """ - Calculates a Spearman rank-order correlation coefficient and the p-value - to test for non-correlation. - - The Spearman correlation is a nonparametric measure of the linear - relationship between two datasets. Unlike the Pearson correlation, the - Spearman correlation does not assume that both datasets are normally - distributed. Like other correlation coefficients, this one varies - between -1 and +1 with 0 implying no correlation. Correlations of -1 or - +1 imply an exact linear relationship. Positive correlations imply that - as `x` increases, so does `y`. Negative correlations imply that as `x` - increases, `y` decreases. - - Missing values are discarded pair-wise: if a value is missing in `x`, the - corresponding value in `y` is masked. - - The p-value roughly indicates the probability of an uncorrelated system - producing datasets that have a Spearman correlation at least as extreme - as the one computed from these datasets. The p-values are not entirely - reliable but are probably reasonable for datasets larger than 500 or so. - - Parameters - ---------- - x : array_like - The length of `x` must be > 2. - y : array_like - The length of `y` must be > 2. - use_ties : bool, optional - Whether the correction for ties should be computed. - - Returns - ------- - spearmanr : float - Spearman correlation coefficient, 2-tailed p-value. - - References - ---------- - [CRCProbStat2000] section 14.7 - - """ - (x, y, n) = _chk_size(x, y) - (x, y) = (x.ravel(), y.ravel()) - - m = ma.mask_or(ma.getmask(x), ma.getmask(y)) - n -= m.sum() - if m is not nomask: - x = ma.array(x, mask=m, copy=True) - y = ma.array(y, mask=m, copy=True) - df = n-2 - if df < 0: - raise ValueError("The input must have at least 3 entries!") - - # Gets the ranks and rank differences - rankx = rankdata(x) - ranky = rankdata(y) - dsq = np.add.reduce((rankx-ranky)**2) - # Tie correction - if use_ties: - xties = count_tied_groups(x) - yties = count_tied_groups(y) - corr_x = np.sum(v*k*(k**2-1) for (k,v) in iteritems(xties))/12. - corr_y = np.sum(v*k*(k**2-1) for (k,v) in iteritems(yties))/12. - else: - corr_x = corr_y = 0 - - denom = n*(n**2 - 1)/6. - if corr_x != 0 or corr_y != 0: - rho = denom - dsq - corr_x - corr_y - rho /= ma.sqrt((denom-2*corr_x)*(denom-2*corr_y)) - else: - rho = 1. - dsq/denom - - t = ma.sqrt(ma.divide(df,(rho+1.0)*(1.0-rho))) * rho - if t is masked: - prob = 0. - else: - prob = betai(0.5*df,0.5,df/(df+t*t)) - - return rho, prob - - -def kendalltau(x, y, use_ties=True, use_missing=False): - """ - Computes Kendall's rank correlation tau on two variables *x* and *y*. 
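-
-    In the absence of ties the statistic reduces to
-    ``tau = (C - D) / (n*(n-1)/2.)``, where ``C`` and ``D`` count the
-    concordant and discordant pairs; the implementation below applies the
-    usual tie corrections to both the denominator and the variance used
-    for the approximate p-value.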
- - Parameters - ---------- - xdata : sequence - First data list (for example, time). - ydata : sequence - Second data list. - use_ties : {True, False}, optional - Whether ties correction should be performed. - use_missing : {False, True}, optional - Whether missing data should be allocated a rank of 0 (False) or the - average rank (True) - - Returns - ------- - tau : float - Kendall tau - prob : float - Approximate 2-side p-value. - - """ - (x, y, n) = _chk_size(x, y) - (x, y) = (x.flatten(), y.flatten()) - m = ma.mask_or(ma.getmask(x), ma.getmask(y)) - if m is not nomask: - x = ma.array(x, mask=m, copy=True) - y = ma.array(y, mask=m, copy=True) - n -= m.sum() - - if n < 2: - return (np.nan, np.nan) - - rx = ma.masked_equal(rankdata(x, use_missing=use_missing), 0) - ry = ma.masked_equal(rankdata(y, use_missing=use_missing), 0) - idx = rx.argsort() - (rx, ry) = (rx[idx], ry[idx]) - C = np.sum([((ry[i+1:] > ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum() - for i in range(len(ry)-1)], dtype=float) - D = np.sum([((ry[i+1:] < ry[i])*(rx[i+1:] > rx[i])).filled(0).sum() - for i in range(len(ry)-1)], dtype=float) - if use_ties: - xties = count_tied_groups(x) - yties = count_tied_groups(y) - corr_x = np.sum([v*k*(k-1) for (k,v) in iteritems(xties)], dtype=float) - corr_y = np.sum([v*k*(k-1) for (k,v) in iteritems(yties)], dtype=float) - denom = ma.sqrt((n*(n-1)-corr_x)/2. * (n*(n-1)-corr_y)/2.) - else: - denom = n*(n-1)/2. - tau = (C-D) / denom - - var_s = n*(n-1)*(2*n+5) - if use_ties: - var_s -= np.sum(v*k*(k-1)*(2*k+5)*1. for (k,v) in iteritems(xties)) - var_s -= np.sum(v*k*(k-1)*(2*k+5)*1. for (k,v) in iteritems(yties)) - v1 = np.sum([v*k*(k-1) for (k, v) in iteritems(xties)], dtype=float) *\ - np.sum([v*k*(k-1) for (k, v) in iteritems(yties)], dtype=float) - v1 /= 2.*n*(n-1) - if n > 2: - v2 = np.sum([v*k*(k-1)*(k-2) for (k,v) in iteritems(xties)], - dtype=float) * \ - np.sum([v*k*(k-1)*(k-2) for (k,v) in iteritems(yties)], - dtype=float) - v2 /= 9.*n*(n-1)*(n-2) - else: - v2 = 0 - else: - v1 = v2 = 0 - - var_s /= 18. - var_s += (v1 + v2) - z = (C-D)/np.sqrt(var_s) - prob = special.erfc(abs(z)/np.sqrt(2)) - return (tau, prob) - - -def kendalltau_seasonal(x): - """ - Computes a multivariate Kendall's rank correlation tau, for seasonal data. - - Parameters - ---------- - x : 2-D ndarray - Array of seasonal data, with seasons in columns. - - """ - x = ma.array(x, subok=True, copy=False, ndmin=2) - (n,m) = x.shape - n_p = x.count(0) - - S_szn = np.sum(msign(x[i:]-x[i]).sum(0) for i in range(n)) - S_tot = S_szn.sum() - - n_tot = x.count() - ties = count_tied_groups(x.compressed()) - corr_ties = np.sum(v*k*(k-1) for (k,v) in iteritems(ties)) - denom_tot = ma.sqrt(1.*n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2. - - R = rankdata(x, axis=0, use_missing=True) - K = ma.empty((m,m), dtype=int) - covmat = ma.empty((m,m), dtype=float) - denom_szn = ma.empty(m, dtype=float) - for j in range(m): - ties_j = count_tied_groups(x[:,j].compressed()) - corr_j = np.sum(v*k*(k-1) for (k,v) in iteritems(ties_j)) - cmb = n_p[j]*(n_p[j]-1) - for k in range(j,m,1): - K[j,k] = np.sum(msign((x[i:,j]-x[i,j])*(x[i:,k]-x[i,k])).sum() - for i in range(n)) - covmat[j,k] = (K[j,k] + 4*(R[:,j]*R[:,k]).sum() - - n*(n_p[j]+1)*(n_p[k]+1))/3. - K[k,j] = K[j,k] - covmat[k,j] = covmat[j,k] - - denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2. 
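-
-    # At this point K holds the concordant-minus-discordant counts between
-    # pairs of seasons and covmat their estimated covariances; the diagonal
-    # of covmat gives the per-season variances used for the z-scores below.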
- - var_szn = covmat.diagonal() - - z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn) - z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum()) - z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum()) - - prob_szn = special.erfc(abs(z_szn)/np.sqrt(2)) - prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2)) - prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2)) - - chi2_tot = (z_szn*z_szn).sum() - chi2_trd = m * z_szn.mean()**2 - output = {'seasonal tau': S_szn/denom_szn, - 'global tau': S_tot/denom_tot, - 'global tau (alt)': S_tot/denom_szn.sum(), - 'seasonal p-value': prob_szn, - 'global p-value (indep)': prob_tot_ind, - 'global p-value (dep)': prob_tot_dep, - 'chi2 total': chi2_tot, - 'chi2 trend': chi2_trd, - } - return output - - -def pointbiserialr(x, y): - x = ma.fix_invalid(x, copy=True).astype(bool) - y = ma.fix_invalid(y, copy=True).astype(float) - # Get rid of the missing data - m = ma.mask_or(ma.getmask(x), ma.getmask(y)) - if m is not nomask: - unmask = np.logical_not(m) - x = x[unmask] - y = y[unmask] - - n = len(x) - # phat is the fraction of x values that are True - phat = x.sum() / float(n) - y0 = y[~x] # y-values where x is False - y1 = y[x] # y-values where x is True - y0m = y0.mean() - y1m = y1.mean() - - rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std() - - df = n-2 - t = rpb*ma.sqrt(df/(1.0-rpb**2)) - prob = betai(0.5*df, 0.5, df/(df+t*t)) - return rpb, prob - -if stats.pointbiserialr.__doc__: - pointbiserialr.__doc__ = stats.pointbiserialr.__doc__ + genmissingvaldoc - - -def linregress(*args): - """ - Linear regression calculation - - Note that the non-masked version is used, and that this docstring is - replaced by the non-masked docstring + some info on missing data. - - """ - if len(args) == 1: - # Input is a single 2-D array containing x and y - args = ma.array(args[0], copy=True) - if len(args) == 2: - x = args[0] - y = args[1] - else: - x = args[:, 0] - y = args[:, 1] - else: - # Input is two 1-D arrays - x = ma.array(args[0]).flatten() - y = ma.array(args[1]).flatten() - - m = ma.mask_or(ma.getmask(x), ma.getmask(y), shrink=False) - if m is not nomask: - x = ma.array(x, mask=m) - y = ma.array(y, mask=m) - if np.any(~m): - slope, intercept, r, prob, sterrest = stats.linregress(x.data[~m], - y.data[~m]) - else: - # All data is masked - return None, None, None, None, None - else: - slope, intercept, r, prob, sterrest = stats.linregress(x.data, y.data) - - return slope, intercept, r, prob, sterrest - -if stats.linregress.__doc__: - linregress.__doc__ = stats.linregress.__doc__ + genmissingvaldoc - - -def theilslopes(y, x=None, alpha=0.95): - y = ma.asarray(y).flatten() - if x is None: - x = ma.arange(len(y), dtype=float) - else: - x = ma.asarray(x).flatten() - if len(x) != len(y): - raise ValueError("Incompatible lengths ! 
(%s<>%s)" % (len(y),len(x))) - - m = ma.mask_or(ma.getmask(x), ma.getmask(y)) - y._mask = x._mask = m - # Disregard any masked elements of x or y - y = y.compressed() - x = x.compressed().astype(float) - # We now have unmasked arrays so can use `stats.theilslopes` - return stats.theilslopes(y, x, alpha=alpha) -theilslopes.__doc__ = stats.theilslopes.__doc__ - - -def sen_seasonal_slopes(x): - x = ma.array(x, subok=True, copy=False, ndmin=2) - (n,_) = x.shape - # Get list of slopes per season - szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1,n-i)[:,None] - for i in range(n)]) - szn_medslopes = ma.median(szn_slopes, axis=0) - medslope = ma.median(szn_slopes, axis=None) - return szn_medslopes, medslope - - -def ttest_1samp(a, popmean, axis=0): - a, axis = _chk_asarray(a, axis) - if a.size == 0: - return (np.nan, np.nan) - - x = a.mean(axis=axis) - v = a.var(axis=axis, ddof=1) - n = a.count(axis=axis) - df = n - 1. - svar = ((n - 1) * v) / df - t = (x - popmean) / ma.sqrt(svar / n) - prob = betai(0.5 * df, 0.5, df / (df + t*t)) - return t, prob -ttest_1samp.__doc__ = stats.ttest_1samp.__doc__ -ttest_onesamp = ttest_1samp - - -def ttest_ind(a, b, axis=0): - a, b, axis = _chk2_asarray(a, b, axis) - if a.size == 0 or b.size == 0: - return (np.nan, np.nan) - - (x1, x2) = (a.mean(axis), b.mean(axis)) - (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1)) - (n1, n2) = (a.count(axis), b.count(axis)) - df = n1 + n2 - 2. - svar = ((n1-1)*v1+(n2-1)*v2) / df - t = (x1-x2)/ma.sqrt(svar*(1.0/n1 + 1.0/n2)) # n-D computation here! - t = ma.filled(t, 1) # replace NaN t-values with 1.0 - probs = betai(0.5 * df, 0.5, df/(df + t*t)).reshape(t.shape) - return t, probs.squeeze() -ttest_ind.__doc__ = stats.ttest_ind.__doc__ - - -def ttest_rel(a, b, axis=0): - a, b, axis = _chk2_asarray(a, b, axis) - if len(a) != len(b): - raise ValueError('unequal length arrays') - - if a.size == 0 or b.size == 0: - return (np.nan, np.nan) - - (x1, x2) = (a.mean(axis), b.mean(axis)) - (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1)) - n = a.count(axis) - df = (n-1.0) - d = (a-b).astype('d') - denom = ma.sqrt((n*ma.add.reduce(d*d,axis) - ma.add.reduce(d,axis)**2) / df) - t = ma.add.reduce(d, axis) / denom - t = ma.filled(t, 1) - probs = betai(0.5*df,0.5,df/(df+t*t)).reshape(t.shape).squeeze() - return t, probs -ttest_rel.__doc__ = stats.ttest_rel.__doc__ - - -# stats.chisquare works with masked arrays, so we don't need to -# implement it here. -# For backwards compatibilty, stats.chisquare is included in -# the stats.mstats namespace. -chisquare = stats.chisquare - - -def mannwhitneyu(x,y, use_continuity=True): - """ - Computes the Mann-Whitney statistic - - Missing values in `x` and/or `y` are discarded. - - Parameters - ---------- - x : sequence - Input - y : sequence - Input - use_continuity : {True, False}, optional - Whether a continuity correction (1/2.) should be taken into account. - - Returns - ------- - u : float - The Mann-Whitney statistics - prob : float - Approximate p-value assuming a normal distribution. - - """ - x = ma.asarray(x).compressed().view(ndarray) - y = ma.asarray(y).compressed().view(ndarray) - ranks = rankdata(np.concatenate([x,y])) - (nx, ny) = (len(x), len(y)) - nt = nx + ny - U = ranks[:nx].sum() - nx*(nx+1)/2. - U = max(U, nx*ny - U) - u = nx*ny - U - - mu = (nx*ny)/2. - sigsq = (nt**3 - nt)/12. - ties = count_tied_groups(ranks) - sigsq -= np.sum(v*(k**3-k) for (k,v) in iteritems(ties))/12. - sigsq *= nx*ny/float(nt*(nt-1)) - - if use_continuity: - z = (U - 1/2. 
- mu) / ma.sqrt(sigsq)
-    else:
-        z = (U - mu) / ma.sqrt(sigsq)
-
-    prob = special.erfc(abs(z)/np.sqrt(2))
-    return (u, prob)
-
-
-def kruskalwallis(*args):
-    output = argstoarray(*args)
-    ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
-    sumrk = ranks.sum(-1)
-    ngrp = ranks.count(-1)
-    ntot = ranks.count()
-    H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
-    # Tie correction
-    ties = count_tied_groups(ranks)
-    T = 1. - np.sum(v*(k**3-k) for (k,v) in iteritems(ties))/float(ntot**3-ntot)
-    if T == 0:
-        raise ValueError('All numbers are identical in kruskal')
-
-    H /= T
-    df = len(output) - 1
-    prob = stats.chisqprob(H,df)
-    return (H, prob)
-kruskal = kruskalwallis
-kruskalwallis.__doc__ = stats.kruskal.__doc__
-
-
-def ks_twosamp(data1, data2, alternative="two-sided"):
-    """
-    Computes the Kolmogorov-Smirnov test on two samples.
-
-    Missing values are discarded.
-
-    Parameters
-    ----------
-    data1 : array_like
-        First data set
-    data2 : array_like
-        Second data set
-    alternative : {'two-sided', 'less', 'greater'}, optional
-        Indicates the alternative hypothesis.  Default is 'two-sided'.
-
-    Returns
-    -------
-    d : float
-        Value of the Kolmogorov-Smirnov statistic.
-    p : float
-        Corresponding p-value.
-
-    """
-    (data1, data2) = (ma.asarray(data1), ma.asarray(data2))
-    (n1, n2) = (data1.count(), data2.count())
-    n = (n1*n2/float(n1+n2))
-    mix = ma.concatenate((data1.compressed(), data2.compressed()))
-    mixsort = mix.argsort(kind='mergesort')
-    csum = np.where(mixsort < n1, 1./n1, -1./n2).cumsum()
-    # Check for ties
-    if len(np.unique(mix)) < (n1+n2):
-        csum = csum[np.r_[np.diff(mix[mixsort]).nonzero()[0],-1]]
-
-    alternative = str(alternative).lower()[0]
-    if alternative == 't':
-        d = ma.abs(csum).max()
-        prob = special.kolmogorov(np.sqrt(n)*d)
-    elif alternative == 'l':
-        d = -csum.min()
-        prob = np.exp(-2*n*d**2)
-    elif alternative == 'g':
-        d = csum.max()
-        prob = np.exp(-2*n*d**2)
-    else:
-        raise ValueError("Invalid value for the alternative hypothesis: "
-                         "should be in 'two-sided', 'less' or 'greater'")
-
-    return (d, prob)
-ks_2samp = ks_twosamp
-
-
-def ks_twosamp_old(data1, data2):
-    """ Computes the Kolmogorov-Smirnov statistic on 2 samples.
-
-    Returns
-    -------
-    KS D-value, p-value
-
-    """
-    (data1, data2) = [ma.asarray(d).compressed() for d in (data1,data2)]
-    return stats.ks_2samp(data1,data2)
-
-
-def threshold(a, threshmin=None, threshmax=None, newval=0):
-    """
-    Clip array to a given value.
-
-    Similar to numpy.clip(), except that values less than `threshmin` or
-    greater than `threshmax` are replaced by `newval`, instead of by
-    `threshmin` and `threshmax` respectively.
-
-    Parameters
-    ----------
-    a : ndarray
-        Input data
-    threshmin : {None, float}, optional
-        Lower threshold.  If None, set to the minimum value.
-    threshmax : {None, float}, optional
-        Upper threshold.  If None, set to the maximum value.
-    newval : {0, float}, optional
-        Value outside the thresholds.
-
-    Returns
-    -------
-    threshold : ndarray
-        Returns `a`, with values less than `threshmin` and values greater
-        than `threshmax` replaced with `newval`.
-
-    """
-    a = ma.array(a, copy=True)
-    mask = np.zeros(a.shape, dtype=bool)
-    if threshmin is not None:
-        mask |= (a < threshmin).filled(False)
-
-    if threshmax is not None:
-        mask |= (a > threshmax).filled(False)
-
-    a[mask] = newval
-    return a
-
-
-def trima(a, limits=None, inclusive=(True,True)):
-    """
-    Trims an array by masking the data outside some given limits.
-
-    Returns a masked version of the input array.
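-
-    For example (a sketch; with ``limits=(2, 8)`` and the default inclusive
-    flags, the values 0, 1 and 9 are masked):
-
-    >>> from scipy.stats import mstats
-    >>> mstats.trima(np.arange(10), limits=(2, 8))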
- - Parameters - ---------- - a : array_like - Input array. - limits : {None, tuple}, optional - Tuple of (lower limit, upper limit) in absolute values. - Values of the input array lower (greater) than the lower (upper) limit - will be masked. A limit is None indicates an open interval. - inclusive : (bool, bool) tuple, optional - Tuple of (lower flag, upper flag), indicating whether values exactly - equal to the lower (upper) limit are allowed. - - """ - a = ma.asarray(a) - a.unshare_mask() - if (limits is None) or (limits == (None, None)): - return a - - (lower_lim, upper_lim) = limits - (lower_in, upper_in) = inclusive - condition = False - if lower_lim is not None: - if lower_in: - condition |= (a < lower_lim) - else: - condition |= (a <= lower_lim) - - if upper_lim is not None: - if upper_in: - condition |= (a > upper_lim) - else: - condition |= (a >= upper_lim) - - a[condition.filled(True)] = masked - return a - - -def trimr(a, limits=None, inclusive=(True, True), axis=None): - """ - Trims an array by masking some proportion of the data on each end. - Returns a masked version of the input array. - - Parameters - ---------- - a : sequence - Input array. - limits : {None, tuple}, optional - Tuple of the percentages to cut on each side of the array, with respect - to the number of unmasked data, as floats between 0. and 1. - Noting n the number of unmasked data before trimming, the - (n*limits[0])th smallest data and the (n*limits[1])th largest data are - masked, and the total number of unmasked data after trimming is - n*(1.-sum(limits)). The value of one limit can be set to None to - indicate an open interval. - inclusive : {(True,True) tuple}, optional - Tuple of flags indicating whether the number of data being masked on - the left (right) end should be truncated (True) or rounded (False) to - integers. - axis : {None,int}, optional - Axis along which to trim. If None, the whole array is trimmed, but its - shape is maintained. - - """ - def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive): - n = a.count() - idx = a.argsort() - if low_limit: - if low_inclusive: - lowidx = int(low_limit*n) - else: - lowidx = np.round(low_limit*n) - a[idx[:lowidx]] = masked - if up_limit is not None: - if up_inclusive: - upidx = n - int(n*up_limit) - else: - upidx = n - np.round(n*up_limit) - a[idx[upidx:]] = masked - return a - - a = ma.asarray(a) - a.unshare_mask() - if limits is None: - return a - - # Check the limits - (lolim, uplim) = limits - errmsg = "The proportion to cut from the %s should be between 0. and 1." - if lolim is not None: - if lolim > 1. or lolim < 0: - raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim) - if uplim is not None: - if uplim > 1. or uplim < 0: - raise ValueError(errmsg % 'end' + "(got %s)" % uplim) - - (loinc, upinc) = inclusive - - if axis is None: - shp = a.shape - return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp) - else: - return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc) - -trimdoc = """ - Parameters - ---------- - a : sequence - Input array - limits : {None, tuple}, optional - If `relative` is False, tuple (lower limit, upper limit) in absolute values. - Values of the input array lower (greater) than the lower (upper) limit are - masked. - - If `relative` is True, tuple (lower percentage, upper percentage) to cut - on each side of the array, with respect to the number of unmasked data. 
- - Noting n the number of unmasked data before trimming, the (n*limits[0])th - smallest data and the (n*limits[1])th largest data are masked, and the - total number of unmasked data after trimming is n*(1.-sum(limits)) - In each case, the value of one limit can be set to None to indicate an - open interval. - - If limits is None, no trimming is performed - inclusive : {(bool, bool) tuple}, optional - If `relative` is False, tuple indicating whether values exactly equal - to the absolute limits are allowed. - If `relative` is True, tuple indicating whether the number of data - being masked on each side should be rounded (True) or truncated - (False). - relative : bool, optional - Whether to consider the limits as absolute values (False) or proportions - to cut (True). - axis : int, optional - Axis along which to trim. -""" - - -def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None): - """ - Trims an array by masking the data outside some given limits. - - Returns a masked version of the input array. - - %s - - Examples - -------- - >>> z = [ 1, 2, 3, 4, 5, 6, 7, 8, 9,10] - >>> trim(z,(3,8)) - [--,--, 3, 4, 5, 6, 7, 8,--,--] - >>> trim(z,(0.1,0.2),relative=True) - [--, 2, 3, 4, 5, 6, 7, 8,--,--] - - """ - if relative: - return trimr(a, limits=limits, inclusive=inclusive, axis=axis) - else: - return trima(a, limits=limits, inclusive=inclusive) - -if trim.__doc__ is not None: - trim.__doc__ = trim.__doc__ % trimdoc - - -def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None): - """ - Trims the smallest and largest data values. - - Trims the `data` by masking the ``int(proportiontocut * n)`` smallest and - ``int(proportiontocut * n)`` largest values of data along the given axis, - where n is the number of unmasked values before trimming. - - Parameters - ---------- - data : ndarray - Data to trim. - proportiontocut : float, optional - Percentage of trimming (as a float between 0 and 1). - If n is the number of unmasked values before trimming, the number of - values after trimming is ``(1 - 2*proportiontocut) * n``. - Default is 0.2. - inclusive : {(bool, bool) tuple}, optional - Tuple indicating whether the number of data being masked on each side - should be rounded (True) or truncated (False). - axis : int, optional - Axis along which to perform the trimming. - If None, the input array is first flattened. - - """ - return trimr(data, limits=(proportiontocut,proportiontocut), - inclusive=inclusive, axis=axis) - - -def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True), - axis=None): - """ - Trims the data by masking values from one tail. - - Parameters - ---------- - data : array_like - Data to trim. - proportiontocut : float, optional - Percentage of trimming. If n is the number of unmasked values - before trimming, the number of values after trimming is - ``(1 - proportiontocut) * n``. Default is 0.2. - tail : {'left','right'}, optional - If 'left' the `proportiontocut` lowest values will be masked. - If 'right' the `proportiontocut` highest values will be masked. - Default is 'left'. - inclusive : {(bool, bool) tuple}, optional - Tuple indicating whether the number of data being masked on each side - should be rounded (True) or truncated (False). Default is - (True, True). - axis : int, optional - Axis along which to perform the trimming. - If None, the input array is first flattened. Default is None. - - Returns - ------- - trimtail : ndarray - Returned array of same shape as `data` with masked tail values. 
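-
-    For example (a sketch; with ``proportiontocut=0.2`` the two lowest of
-    ten values are masked):
-
-    >>> from scipy.stats import mstats
-    >>> z = np.ma.array([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
-    >>> mstats.trimtail(z, proportiontocut=0.2, tail='left')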
- - """ - tail = str(tail).lower()[0] - if tail == 'l': - limits = (proportiontocut,None) - elif tail == 'r': - limits = (None, proportiontocut) - else: - raise TypeError("The tail argument should be in ('left','right')") - - return trimr(data, limits=limits, axis=axis, inclusive=inclusive) - -trim1 = trimtail - - -def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, - axis=None): - """Returns the trimmed mean of the data along the given axis. - - %s - - """ % trimdoc - if (not isinstance(limits,tuple)) and isinstance(limits,float): - limits = (limits, limits) - if relative: - return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis) - else: - return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis) - - -def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, - axis=None, ddof=0): - """Returns the trimmed variance of the data along the given axis. - - %s - ddof : {0,integer}, optional - Means Delta Degrees of Freedom. The denominator used during computations - is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un- - biased estimate of the variance. - - """ % trimdoc - if (not isinstance(limits,tuple)) and isinstance(limits,float): - limits = (limits, limits) - if relative: - out = trimr(a,limits=limits, inclusive=inclusive,axis=axis) - else: - out = trima(a,limits=limits,inclusive=inclusive) - - return out.var(axis=axis, ddof=ddof) - - -def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, - axis=None, ddof=0): - """Returns the trimmed standard deviation of the data along the given axis. - - %s - ddof : {0,integer}, optional - Means Delta Degrees of Freedom. The denominator used during computations - is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un- - biased estimate of the variance. - - """ % trimdoc - if (not isinstance(limits,tuple)) and isinstance(limits,float): - limits = (limits, limits) - if relative: - out = trimr(a,limits=limits,inclusive=inclusive,axis=axis) - else: - out = trima(a,limits=limits,inclusive=inclusive) - return out.std(axis=axis,ddof=ddof) - - -def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None): - """ - Returns the standard error of the trimmed mean along the given axis. - - Parameters - ---------- - a : sequence - Input array - limits : {(0.1,0.1), tuple of float}, optional - tuple (lower percentage, upper percentage) to cut on each side of the - array, with respect to the number of unmasked data. - - If n is the number of unmasked data before trimming, the values - smaller than ``n * limits[0]`` and the values larger than - ``n * `limits[1]`` are masked, and the total number of unmasked - data after trimming is ``n * (1.-sum(limits))``. In each case, - the value of one limit can be set to None to indicate an open interval. - If `limits` is None, no trimming is performed. - inclusive : {(bool, bool) tuple} optional - Tuple indicating whether the number of data being masked on each side - should be rounded (True) or truncated (False). - axis : int, optional - Axis along which to trim. - - Returns - ------- - trimmed_stde : scalar or ndarray - - """ - def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive): - "Returns the standard error of the trimmed mean for a 1D input data." 
- n = a.count() - idx = a.argsort() - if low_limit: - if low_inclusive: - lowidx = int(low_limit*n) - else: - lowidx = np.round(low_limit*n) - a[idx[:lowidx]] = masked - if up_limit is not None: - if up_inclusive: - upidx = n - int(n*up_limit) - else: - upidx = n - np.round(n*up_limit) - a[idx[upidx:]] = masked - a[idx[:lowidx]] = a[idx[lowidx]] - a[idx[upidx:]] = a[idx[upidx-1]] - winstd = a.std(ddof=1) - return winstd / ((1-low_limit-up_limit)*np.sqrt(len(a))) - - a = ma.array(a, copy=True, subok=True) - a.unshare_mask() - if limits is None: - return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis)) - if (not isinstance(limits,tuple)) and isinstance(limits,float): - limits = (limits, limits) - - # Check the limits - (lolim, uplim) = limits - errmsg = "The proportion to cut from the %s should be between 0. and 1." - if lolim is not None: - if lolim > 1. or lolim < 0: - raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim) - if uplim is not None: - if uplim > 1. or uplim < 0: - raise ValueError(errmsg % 'end' + "(got %s)" % uplim) - - (loinc, upinc) = inclusive - if (axis is None): - return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc) - else: - if a.ndim > 2: - raise ValueError("Array 'a' must be at most two dimensional, but got a.ndim = %d" % a.ndim) - return ma.apply_along_axis(_trimmed_stde_1D, axis, a, - lolim,uplim,loinc,upinc) - - -def tmean(a, limits=None, inclusive=(True,True)): - return trima(a, limits=limits, inclusive=inclusive).mean() -tmean.__doc__ = stats.tmean.__doc__ - - -def tvar(a, limits=None, inclusive=(True,True)): - a = a.astype(float).ravel() - if limits is None: - n = (~a.mask).sum() # todo: better way to do that? - r = trima(a, limits=limits, inclusive=inclusive).var() * (n/(n-1.)) - else: - raise ValueError('mstats.tvar() with limits not implemented yet so far') - - return r -tvar.__doc__ = stats.tvar.__doc__ - - -def tmin(a, lowerlimit=None, axis=0, inclusive=True): - a, axis = _chk_asarray(a, axis) - am = trima(a, (lowerlimit, None), (inclusive, False)) - return ma.minimum.reduce(am, axis) -tmin.__doc__ = stats.tmin.__doc__ - - -def tmax(a, upperlimit, axis=0, inclusive=True): - a, axis = _chk_asarray(a, axis) - am = trima(a, (None, upperlimit), (False, inclusive)) - return ma.maximum.reduce(am, axis) -tmax.__doc__ = stats.tmax.__doc__ - - -def tsem(a, limits=None, inclusive=(True,True)): - a = ma.asarray(a).ravel() - if limits is None: - n = float(a.count()) - return a.std(ddof=1)/ma.sqrt(n) - - am = trima(a.ravel(), limits, inclusive) - sd = np.sqrt(am.var(ddof=1)) - return sd / np.sqrt(am.count()) -tsem.__doc__ = stats.tsem.__doc__ - - -def winsorize(a, limits=None, inclusive=(True, True), inplace=False, - axis=None): - """Returns a Winsorized version of the input array. - - The (limits[0])th lowest values are set to the (limits[0])th percentile, - and the (limits[1])th highest values are set to the (1 - limits[1])th - percentile. - Masked values are skipped. - - - Parameters - ---------- - a : sequence - Input array. - limits : {None, tuple of float}, optional - Tuple of the percentages to cut on each side of the array, with respect - to the number of unmasked data, as floats between 0. and 1. - Noting n the number of unmasked data before trimming, the - (n*limits[0])th smallest data and the (n*limits[1])th largest data are - masked, and the total number of unmasked data after trimming - is n*(1.-sum(limits)) The value of one limit can be set to None to - indicate an open interval. 
- inclusive : {(True, True) tuple}, optional - Tuple indicating whether the number of data being masked on each side - should be rounded (True) or truncated (False). - inplace : {False, True}, optional - Whether to winsorize in place (True) or to use a copy (False) - axis : {None, int}, optional - Axis along which to trim. If None, the whole array is trimmed, but its - shape is maintained. - - Notes - ----- - This function is applied to reduce the effect of possibly spurious outliers - by limiting the extreme values. - - """ - def _winsorize1D(a, low_limit, up_limit, low_include, up_include): - n = a.count() - idx = a.argsort() - if low_limit: - if low_include: - lowidx = int(low_limit * n) - else: - lowidx = np.round(low_limit * n) - a[idx[:lowidx]] = a[idx[lowidx]] - if up_limit is not None: - if up_include: - upidx = n - int(n * up_limit) - else: - upidx = n - np.round(n * up_limit) - a[idx[upidx:]] = a[idx[upidx - 1]] - return a - - # We are going to modify a: better make a copy - a = ma.array(a, copy=np.logical_not(inplace)) - - if limits is None: - return a - if (not isinstance(limits, tuple)) and isinstance(limits, float): - limits = (limits, limits) - - # Check the limits - (lolim, uplim) = limits - errmsg = "The proportion to cut from the %s should be between 0. and 1." - if lolim is not None: - if lolim > 1. or lolim < 0: - raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim) - if uplim is not None: - if uplim > 1. or uplim < 0: - raise ValueError(errmsg % 'end' + "(got %s)" % uplim) - - (loinc, upinc) = inclusive - - if axis is None: - shp = a.shape - return _winsorize1D(a.ravel(), lolim, uplim, loinc, upinc).reshape(shp) - else: - return ma.apply_along_axis(_winsorize1D, axis, a, lolim, uplim, loinc, - upinc) - - -def moment(a, moment=1, axis=0): - a, axis = _chk_asarray(a, axis) - if moment == 1: - # By definition the first moment about the mean is 0. 
- shape = list(a.shape) - del shape[axis] - if shape: - # return an actual array of the appropriate shape - return np.zeros(shape, dtype=float) - else: - # the input was 1D, so return a scalar instead of a rank-0 array - return np.float64(0.0) - else: - mn = ma.expand_dims(a.mean(axis=axis), axis) - s = ma.power((a-mn), moment) - return s.mean(axis=axis) -moment.__doc__ = stats.moment.__doc__ - - -def variation(a, axis=0): - a, axis = _chk_asarray(a, axis) - return a.std(axis)/a.mean(axis) -variation.__doc__ = stats.variation.__doc__ - - -def skew(a, axis=0, bias=True): - a, axis = _chk_asarray(a,axis) - n = a.count(axis) - m2 = moment(a, 2, axis) - m3 = moment(a, 3, axis) - olderr = np.seterr(all='ignore') - try: - vals = ma.where(m2 == 0, 0, m3 / m2**1.5) - finally: - np.seterr(**olderr) - - if not bias: - can_correct = (n > 2) & (m2 > 0) - if can_correct.any(): - m2 = np.extract(can_correct, m2) - m3 = np.extract(can_correct, m3) - nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5 - np.place(vals, can_correct, nval) - return vals -skew.__doc__ = stats.skew.__doc__ - - -def kurtosis(a, axis=0, fisher=True, bias=True): - a, axis = _chk_asarray(a, axis) - m2 = moment(a, 2, axis) - m4 = moment(a, 4, axis) - olderr = np.seterr(all='ignore') - try: - vals = ma.where(m2 == 0, 0, m4 / m2**2.0) - finally: - np.seterr(**olderr) - - if not bias: - n = a.count(axis) - can_correct = (n > 3) & (m2 is not ma.masked and m2 > 0) - if can_correct.any(): - n = np.extract(can_correct, n) - m2 = np.extract(can_correct, m2) - m4 = np.extract(can_correct, m4) - nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0) - np.place(vals, can_correct, nval+3.0) - if fisher: - return vals - 3 - else: - return vals -kurtosis.__doc__ = stats.kurtosis.__doc__ - - -def describe(a, axis=0,ddof=0): - """ - Computes several descriptive statistics of the passed array. - - Parameters - ---------- - a : array - - axis : int or None - - ddof : int - degree of freedom (default 0); note that default ddof is different - from the same routine in stats.describe - - Returns - ------- - n : int - (size of the data (discarding missing values) - mm : (int, int) - min, max - - arithmetic mean : float - - unbiased variance : float - - biased skewness : float - - biased kurtosis : float - - Examples - -------- - - >>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1]) - >>> describe(ma) - (array(3), - (0, 2), - 1.0, - 1.0, - masked_array(data = 0.0, - mask = False, - fill_value = 1e+20) - , - -1.5) - - """ - a, axis = _chk_asarray(a, axis) - n = a.count(axis) - mm = (ma.minimum.reduce(a), ma.maximum.reduce(a)) - m = a.mean(axis) - v = a.var(axis,ddof=ddof) - sk = skew(a, axis) - kurt = kurtosis(a, axis) - return n, mm, m, v, sk, kurt - - -def stde_median(data, axis=None): - """Returns the McKean-Schrader estimate of the standard error of the sample - median along the given axis. masked values are discarded. - - Parameters - ---------- - data : ndarray - Data to trim. - axis : {None,int}, optional - Axis along which to perform the trimming. - If None, the input array is first flattened. - - """ - def _stdemed_1D(data): - data = np.sort(data.compressed()) - n = len(data) - z = 2.5758293035489004 - k = int(np.round((n+1)/2. 
- z * np.sqrt(n/4.),0)) - return ((data[n-k] - data[k-1])/(2.*z)) - - data = ma.array(data, copy=False, subok=True) - if (axis is None): - return _stdemed_1D(data) - else: - if data.ndim > 2: - raise ValueError("Array 'data' must be at most two dimensional, " - "but got data.ndim = %d" % data.ndim) - return ma.apply_along_axis(_stdemed_1D, axis, data) - - -def skewtest(a, axis=0): - a, axis = _chk_asarray(a, axis) - if axis is None: - a = a.ravel() - axis = 0 - b2 = skew(a,axis) - n = a.count(axis) - if np.min(n) < 8: - raise ValueError( - "skewtest is not valid with less than 8 samples; %i samples" - " were given." % np.min(n)) - - y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2))) - beta2 = (3.0*(n*n+27*n-70)*(n+1)*(n+3)) / ((n-2.0)*(n+5)*(n+7)*(n+9)) - W2 = -1 + ma.sqrt(2*(beta2-1)) - delta = 1/ma.sqrt(0.5*ma.log(W2)) - alpha = ma.sqrt(2.0/(W2-1)) - y = ma.where(y == 0, 1, y) - Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1)) - return Z, 2 * distributions.norm.sf(np.abs(Z)) -skewtest.__doc__ = stats.skewtest.__doc__ - - -def kurtosistest(a, axis=0): - a, axis = _chk_asarray(a, axis) - n = a.count(axis=axis) - if np.min(n) < 5: - raise ValueError( - "kurtosistest requires at least 5 observations; %i observations" - " were given." % np.min(n)) - if np.min(n) < 20: - warnings.warn( - "kurtosistest only valid for n>=20 ... continuing anyway, n=%i" % - np.min(n)) - - b2 = kurtosis(a, axis, fisher=False) - E = 3.0*(n-1) / (n+1) - varb2 = 24.0*n*(n-2.)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5)) - x = (b2-E)/ma.sqrt(varb2) - sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) / - (n*(n-2)*(n-3))) - A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2))) - term1 = 1 - 2./(9.0*A) - denom = 1 + x*ma.sqrt(2/(A-4.0)) - if np.ma.isMaskedArray(denom): - # For multi-dimensional array input - denom[denom < 0] = masked - elif denom < 0: - denom = masked - - term2 = ma.power((1-2.0/A)/denom,1/3.0) - Z = (term1 - term2) / np.sqrt(2/(9.0*A)) - return Z, 2 * distributions.norm.sf(np.abs(Z)) -kurtosistest.__doc__ = stats.kurtosistest.__doc__ - - -def normaltest(a, axis=0): - a, axis = _chk_asarray(a, axis) - s, _ = skewtest(a, axis) - k, _ = kurtosistest(a, axis) - k2 = s*s + k*k - return k2, stats.chisqprob(k2,2) -normaltest.__doc__ = stats.normaltest.__doc__ - - -def mquantiles(a, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None, - limit=()): - """ - Computes empirical quantiles for a data array. - - Samples quantile are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``, - where ``x[j]`` is the j-th order statistic, and gamma is a function of - ``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and - ``g = n*p + m - j``. - - Reinterpreting the above equations to compare to **R** lead to the - equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)`` - - Typical values of (alphap,betap) are: - - (0,1) : ``p(k) = k/n`` : linear interpolation of cdf - (**R** type 4) - - (.5,.5) : ``p(k) = (k - 1/2.)/n`` : piecewise linear function - (**R** type 5) - - (0,0) : ``p(k) = k/(n+1)`` : - (**R** type 6) - - (1,1) : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])]. - (**R** type 7, **R** default) - - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])]. - The resulting quantile estimates are approximately median-unbiased - regardless of the distribution of x. - (**R** type 8) - - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom. 
- The resulting quantile estimates are approximately unbiased - if x is normally distributed - (**R** type 9) - - (.4,.4) : approximately quantile unbiased (Cunnane) - - (.35,.35): APL, used with PWM - - Parameters - ---------- - a : array_like - Input data, as a sequence or array of dimension at most 2. - prob : array_like, optional - List of quantiles to compute. - alphap : float, optional - Plotting positions parameter, default is 0.4. - betap : float, optional - Plotting positions parameter, default is 0.4. - axis : int, optional - Axis along which to perform the trimming. - If None (default), the input array is first flattened. - limit : tuple - Tuple of (lower, upper) values. - Values of `a` outside this open interval are ignored. - - Returns - ------- - mquantiles : MaskedArray - An array containing the calculated quantiles. - - Notes - ----- - This formulation is very similar to **R** except the calculation of - ``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined - with each type. - - References - ---------- - .. [1] *R* statistical software: http://www.r-project.org/ - .. [2] *R* ``quantile`` function: - http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html - - Examples - -------- - >>> from scipy.stats.mstats import mquantiles - >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.]) - >>> mquantiles(a) - array([ 19.2, 40. , 42.8]) - - Using a 2D array, specifying axis and limit. - - >>> data = np.array([[ 6., 7., 1.], - [ 47., 15., 2.], - [ 49., 36., 3.], - [ 15., 39., 4.], - [ 42., 40., -999.], - [ 41., 41., -999.], - [ 7., -999., -999.], - [ 39., -999., -999.], - [ 43., -999., -999.], - [ 40., -999., -999.], - [ 36., -999., -999.]]) - >>> mquantiles(data, axis=0, limit=(0, 50)) - array([[ 19.2 , 14.6 , 1.45], - [ 40. , 37.5 , 2.5 ], - [ 42.8 , 40.05, 3.55]]) - - >>> data[:, 2] = -999. - >>> mquantiles(data, axis=0, limit=(0, 50)) - masked_array(data = - [[19.2 14.6 --] - [40.0 37.5 --] - [42.8 40.05 --]], - mask = - [[False False True] - [False False True] - [False False True]], - fill_value = 1e+20) - - """ - def _quantiles1D(data,m,p): - x = np.sort(data.compressed()) - n = len(x) - if n == 0: - return ma.array(np.empty(len(p), dtype=float), mask=True) - elif n == 1: - return ma.array(np.resize(x, p.shape), mask=nomask) - aleph = (n*p + m) - k = np.floor(aleph.clip(1, n-1)).astype(int) - gamma = (aleph-k).clip(0,1) - return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()] - - data = ma.array(a, copy=False) - if data.ndim > 2: - raise TypeError("Array should be 2D at most !") - - if limit: - condition = (limit[0] < data) & (data < limit[1]) - data[~condition.filled(True)] = masked - - p = np.array(prob, copy=False, ndmin=1) - m = alphap + p*(1.-alphap-betap) - # Computes quantiles along axis (or globally) - if (axis is None): - return _quantiles1D(data, m, p) - - return ma.apply_along_axis(_quantiles1D, axis, data, m, p) - - -def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4): - """Calculate the score at the given 'per' percentile of the - sequence a. For example, the score at per=50 is the median. - - This function is a shortcut to mquantile - - """ - if (per < 0) or (per > 100.): - raise ValueError("The percentile should be between 0. and 100. !" - " (got %s)" % per) - - return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap, - limit=limit, axis=0).squeeze() - - -def plotting_positions(data, alpha=0.4, beta=0.4): - """ - Returns plotting positions (or empirical percentile points) for the data. 
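-
-    A quick sketch of a typical call (hypothetical data; with the default
-    ``alpha = beta = 0.4`` and ``n = 3`` the positions are
-    ``(i - 0.4)/3.2``):
-
-    >>> from scipy.stats import mstats
-    >>> mstats.plotting_positions([3., 1., 2.])  # -> [0.8125, 0.1875, 0.5]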
- - Plotting positions are defined as ``(i-alpha)/(n+1-alpha-beta)``, where: - - i is the rank order statistics - - n is the number of unmasked values along the given axis - - `alpha` and `beta` are two parameters. - - Typical values for `alpha` and `beta` are: - - (0,1) : ``p(k) = k/n``, linear interpolation of cdf (R, type 4) - - (.5,.5) : ``p(k) = (k-1/2.)/n``, piecewise linear function - (R, type 5) - - (0,0) : ``p(k) = k/(n+1)``, Weibull (R type 6) - - (1,1) : ``p(k) = (k-1)/(n-1)``, in this case, - ``p(k) = mode[F(x[k])]``. That's R default (R type 7) - - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``, then - ``p(k) ~ median[F(x[k])]``. - The resulting quantile estimates are approximately median-unbiased - regardless of the distribution of x. (R type 8) - - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``, Blom. - The resulting quantile estimates are approximately unbiased - if x is normally distributed (R type 9) - - (.4,.4) : approximately quantile unbiased (Cunnane) - - (.35,.35): APL, used with PWM - - (.3175, .3175): used in scipy.stats.probplot - - Parameters - ---------- - data : array_like - Input data, as a sequence or array of dimension at most 2. - alpha : float, optional - Plotting positions parameter. Default is 0.4. - beta : float, optional - Plotting positions parameter. Default is 0.4. - - Returns - ------- - positions : MaskedArray - The calculated plotting positions. - - """ - data = ma.array(data, copy=False).reshape(1,-1) - n = data.count() - plpos = np.empty(data.size, dtype=float) - plpos[n:] = 0 - plpos[data.argsort()[:n]] = ((np.arange(1, n+1) - alpha) / - (n + 1.0 - alpha - beta)) - return ma.array(plpos, mask=data._mask) - -meppf = plotting_positions - - -def obrientransform(*args): - """ - Computes a transform on input data (any number of columns). Used to - test for homogeneity of variance prior to running one-way stats. Each - array in *args is one level of a factor. If an F_oneway() run on the - transformed data and found significant, variances are unequal. From - Maxwell and Delaney, p.112. - - Returns: transformed data for use in an ANOVA - """ - data = argstoarray(*args).T - v = data.var(axis=0,ddof=1) - m = data.mean(0) - n = data.count(0).astype(float) - # result = ((N-1.5)*N*(a-m)**2 - 0.5*v*(n-1))/((n-1)*(n-2)) - data -= m - data **= 2 - data *= (n-1.5)*n - data -= 0.5*v*(n-1) - data /= (n-1.)*(n-2.) - if not ma.allclose(v,data.mean(0)): - raise ValueError("Lack of convergence in obrientransform.") - - return data - - -def signaltonoise(data, axis=0): - """Calculates the signal-to-noise ratio, as the ratio of the mean over - standard deviation along the given axis. - - Parameters - ---------- - data : sequence - Input data - axis : {0, int}, optional - Axis along which to compute. If None, the computation is performed - on a flat version of the array. - """ - data = ma.array(data, copy=False) - m = data.mean(axis) - sd = data.std(axis, ddof=0) - return m/sd - - -def sem(a, axis=0, ddof=1): - """ - Calculates the standard error of the mean of the input array. - - Also sometimes called standard error of measurement. - - Parameters - ---------- - a : array_like - An array containing the values for which the standard error is - returned. - axis : int or None, optional. - If axis is None, ravel `a` first. If axis is an integer, this will be - the axis over which to operate. Defaults to 0. - ddof : int, optional - Delta degrees-of-freedom. How many degrees of freedom to adjust - for bias in limited samples relative to the population estimate - of variance. Defaults to 1. 
- - Returns - ------- - s : ndarray or float - The standard error of the mean in the sample(s), along the input axis. - - Notes - ----- - The default value for `ddof` changed in scipy 0.15.0 to be consistent with - `stats.sem` as well as with the most common definition used (like in the R - documentation). - - Examples - -------- - Find standard error along the first axis: - - >>> from scipy import stats - >>> a = np.arange(20).reshape(5,4) - >>> stats.sem(a) - array([ 2.8284, 2.8284, 2.8284, 2.8284]) - - Find standard error across the whole array, using n degrees of freedom: - - >>> stats.sem(a, axis=None, ddof=0) - 1.2893796958227628 - - """ - a, axis = _chk_asarray(a, axis) - n = a.count(axis=axis) - s = a.std(axis=axis, ddof=ddof) / ma.sqrt(n) - return s - - -zmap = stats.zmap -zscore = stats.zscore - - -def f_oneway(*args): - """ - Performs a 1-way ANOVA, returning an F-value and probability given - any number of groups. From Heiman, pp.394-7. - - Usage: ``f_oneway(*args)``, where ``*args`` is 2 or more arrays, - one per treatment group. - Returns: f-value, probability - - """ - # Construct a single array of arguments: each row is a group - data = argstoarray(*args) - ngroups = len(data) - ntot = data.count() - sstot = (data**2).sum() - (data.sum())**2/float(ntot) - ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum() - sswg = sstot-ssbg - dfbg = ngroups-1 - dfwg = ntot - ngroups - msb = ssbg/float(dfbg) - msw = sswg/float(dfwg) - f = msb/msw - prob = special.fdtrc(dfbg, dfwg, f) # equivalent to stats.f.sf - return f, prob - - -def f_value_wilks_lambda(ER, EF, dfnum, dfden, a, b): - """Calculation of Wilks lambda F-statistic for multivariate data, per - Maxwell & Delaney p.657. - """ - ER = ma.array(ER, copy=False, ndmin=2) - EF = ma.array(EF, copy=False, ndmin=2) - if ma.getmask(ER).any() or ma.getmask(EF).any(): - raise NotImplementedError("Not implemented when the inputs " - "have missing data") - - lmbda = np.linalg.det(EF) / np.linalg.det(ER) - q = ma.sqrt(((a-1)**2*(b-1)**2 - 2) / ((a-1)**2 + (b-1)**2 - 5)) - q = ma.filled(q, 1) - n_um = (1 - lmbda**(1.0/q))*(a-1)*(b-1) - d_en = lmbda**(1.0/q) / (n_um*q - 0.5*(a-1)*(b-1) + 1) - return n_um / d_en - - -def friedmanchisquare(*args): - """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA. - This function calculates the Friedman Chi-square test for repeated measures - and returns the result, along with the associated probability value. - - Each input is considered a given group. Ideally, the number of treatments - among each group should be equal. If this is not the case, only the first - n treatments are taken into account, where n is the number of treatments - of the smallest group. - If a group has some missing values, the corresponding treatments are masked - in the other groups. - The test statistic is corrected for ties. - - Masked values in one group are propagated to the other groups. 
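-
-    A minimal usage sketch (hypothetical groups, one sequence per
-    treatment; at least three are required):
-
-    >>> from scipy.stats import mstats
-    >>> g1 = [1., 2., 3., 4.]
-    >>> g2 = [2., 3., 4., 5.]
-    >>> g3 = [3., 4., 5., 6.]
-    >>> chisq, p = mstats.friedmanchisquare(g1, g2, g3)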
- - Returns: chi-square statistic, associated p-value - """ - data = argstoarray(*args).astype(float) - k = len(data) - if k < 3: - raise ValueError("Less than 3 groups (%i): " % k + - "the Friedman test is NOT appropriate.") - - ranked = ma.masked_values(rankdata(data, axis=0), 0) - if ranked._mask is not nomask: - ranked = ma.mask_cols(ranked) - ranked = ranked.compressed().reshape(k,-1).view(ndarray) - else: - ranked = ranked._data - (k,n) = ranked.shape - # Ties correction - repeats = np.array([find_repeats(_) for _ in ranked.T], dtype=object) - ties = repeats[repeats.nonzero()].reshape(-1,2)[:,-1].astype(int) - tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k)) - - ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2) - chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction - return chisq, stats.chisqprob(chisq,k-1) diff --git a/wafo/stats/mstats_extras.py b/wafo/stats/mstats_extras.py deleted file mode 100644 index 1a0ddcc..0000000 --- a/wafo/stats/mstats_extras.py +++ /dev/null @@ -1,451 +0,0 @@ -""" -Additional statistics functions with support for masked arrays. - -""" - -# Original author (2007): Pierre GF Gerard-Marchant - - -from __future__ import division, print_function, absolute_import - - -__all__ = ['compare_medians_ms', - 'hdquantiles', 'hdmedian', 'hdquantiles_sd', - 'idealfourths', - 'median_cihs','mjci','mquantiles_cimj', - 'rsh', - 'trimmed_mean_ci',] - - -import numpy as np -from numpy import float_, int_, ndarray - -import numpy.ma as ma -from numpy.ma import MaskedArray - -from . import mstats_basic as mstats - -from scipy.stats.distributions import norm, beta, t, binom - - -def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,): - """ - Computes quantile estimates with the Harrell-Davis method. - - The quantile estimates are calculated as a weighted linear combination - of order statistics. - - Parameters - ---------- - data : array_like - Data array. - prob : sequence - Sequence of quantiles to compute. - axis : int - Axis along which to compute the quantiles. If None, use a flattened - array. - var : boolean - Whether to return the variance of the estimate. - - Returns - ------- - hdquantiles : MaskedArray - A (p,) array of quantiles (if `var` is False), or a (2,p) array of - quantiles and variances (if `var` is True), where ``p`` is the - number of quantiles. - - """ - def _hd_1D(data,prob,var): - "Computes the HD quantiles for a 1D array. Returns nan for invalid data." 
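-        # The Harrell-Davis estimate computed below is a weighted mean of
-        # the order statistics; the weights are increments of the cdf of a
-        # Beta((n+1)*p, (n+1)*(1-p)) distribution over a regular grid.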
- xsorted = np.squeeze(np.sort(data.compressed().view(ndarray))) - # Don't use length here, in case we have a numpy scalar - n = xsorted.size - - hd = np.empty((2,len(prob)), float_) - if n < 2: - hd.flat = np.nan - if var: - return hd - return hd[0] - - v = np.arange(n+1) / float(n) - betacdf = beta.cdf - for (i,p) in enumerate(prob): - _w = betacdf(v, (n+1)*p, (n+1)*(1-p)) - w = _w[1:] - _w[:-1] - hd_mean = np.dot(w, xsorted) - hd[0,i] = hd_mean - # - hd[1,i] = np.dot(w, (xsorted-hd_mean)**2) - # - hd[0, prob == 0] = xsorted[0] - hd[0, prob == 1] = xsorted[-1] - if var: - hd[1, prob == 0] = hd[1, prob == 1] = np.nan - return hd - return hd[0] - # Initialization & checks - data = ma.array(data, copy=False, dtype=float_) - p = np.array(prob, copy=False, ndmin=1) - # Computes quantiles along axis (or globally) - if (axis is None) or (data.ndim == 1): - result = _hd_1D(data, p, var) - else: - if data.ndim > 2: - raise ValueError("Array 'data' must be at most two dimensional, " - "but got data.ndim = %d" % data.ndim) - result = ma.apply_along_axis(_hd_1D, axis, data, p, var) - - return ma.fix_invalid(result, copy=False) - - -def hdmedian(data, axis=-1, var=False): - """ - Returns the Harrell-Davis estimate of the median along the given axis. - - Parameters - ---------- - data : ndarray - Data array. - axis : int - Axis along which to compute the quantiles. If None, use a flattened - array. - var : boolean - Whether to return the variance of the estimate. - - """ - result = hdquantiles(data,[0.5], axis=axis, var=var) - return result.squeeze() - - -def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None): - """ - The standard error of the Harrell-Davis quantile estimates by jackknife. - - Parameters - ---------- - data : array_like - Data array. - prob : sequence - Sequence of quantiles to compute. - axis : int - Axis along which to compute the quantiles. If None, use a flattened - array. - - Returns - ------- - hdquantiles_sd : MaskedArray - Standard error of the Harrell-Davis quantile estimates. - - """ - def _hdsd_1D(data,prob): - "Computes the std error for 1D arrays." - xsorted = np.sort(data.compressed()) - n = len(xsorted) - #......... - hdsd = np.empty(len(prob), float_) - if n < 2: - hdsd.flat = np.nan - - vv = np.arange(n) / float(n-1) - betacdf = beta.cdf - - for (i,p) in enumerate(prob): - _w = betacdf(vv, (n+1)*p, (n+1)*(1-p)) - w = _w[1:] - _w[:-1] - mx_ = np.fromiter([np.dot(w,xsorted[np.r_[list(range(0,k)), - list(range(k+1,n))].astype(int_)]) - for k in range(n)], dtype=float_) - mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1) - hdsd[i] = float(n-1) * np.sqrt(np.diag(mx_var).diagonal() / float(n)) - return hdsd - # Initialization & checks - data = ma.array(data, copy=False, dtype=float_) - p = np.array(prob, copy=False, ndmin=1) - # Computes quantiles along axis (or globally) - if (axis is None): - result = _hdsd_1D(data, p) - else: - if data.ndim > 2: - raise ValueError("Array 'data' must be at most two dimensional, " - "but got data.ndim = %d" % data.ndim) - result = ma.apply_along_axis(_hdsd_1D, axis, data, p) - - return ma.fix_invalid(result, copy=False).ravel() - - -def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True), - alpha=0.05, axis=None): - """ - Selected confidence interval of the trimmed mean along the given axis. - - Parameters - ---------- - data : array_like - Input data. - limits : {None, tuple}, optional - None or a two item tuple. 
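# Editor's sketch (not part of the original patch): the _hd_1D helper above
# weights the order statistics by increments of a Beta((n+1)p, (n+1)(1-p))
# CDF evaluated on the grid 0, 1/n, ..., 1.  Stand-alone, for a single p:
import numpy as np
from scipy.stats import beta

def hd_quantile_sketch(x, p):
    xs = np.sort(np.asarray(x, dtype=float))
    n = xs.size
    v = np.arange(n + 1) / float(n)
    w = np.diff(beta.cdf(v, (n + 1) * p, (n + 1) * (1 - p)))
    return np.dot(w, xs)        # ~ np.median(x) for p = 0.5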
- Tuple of the percentages to cut on each side of the array, with respect - to the number of unmasked data, as floats between 0. and 1. If ``n`` - is the number of unmasked data before trimming, then - (``n * limits[0]``)th smallest data and (``n * limits[1]``)th - largest data are masked. The total number of unmasked data after - trimming is ``n * (1. - sum(limits))``. - The value of one limit can be set to None to indicate an open interval. - - Defaults to (0.2, 0.2). - inclusive : (2,) tuple of boolean, optional - If relative==False, tuple indicating whether values exactly equal to - the absolute limits are allowed. - If relative==True, tuple indicating whether the number of data being - masked on each side should be rounded (True) or truncated (False). - - Defaults to (True, True). - alpha : float, optional - Confidence level of the intervals. - - Defaults to 0.05. - axis : int, optional - Axis along which to cut. If None, uses a flattened version of `data`. - - Defaults to None. - - Returns - ------- - trimmed_mean_ci : (2,) ndarray - The lower and upper confidence intervals of the trimmed data. - - """ - data = ma.array(data, copy=False) - trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis) - tmean = trimmed.mean(axis) - tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis) - df = trimmed.count(axis) - 1 - tppf = t.ppf(1-alpha/2.,df) - return np.array((tmean - tppf*tstde, tmean+tppf*tstde)) - - -def mjci(data, prob=[0.25,0.5,0.75], axis=None): - """ - Returns the Maritz-Jarrett estimators of the standard error of selected - experimental quantiles of the data. - - Parameters - ---------- - data: ndarray - Data array. - prob: sequence - Sequence of quantiles to compute. - axis : int - Axis along which to compute the quantiles. If None, use a flattened - array. - - """ - def _mjci_1D(data, p): - data = np.sort(data.compressed()) - n = data.size - prob = (np.array(p) * n + 0.5).astype(int_) - betacdf = beta.cdf - - mj = np.empty(len(prob), float_) - x = np.arange(1,n+1, dtype=float_) / n - y = x - 1./n - for (i,m) in enumerate(prob): - (m1,m2) = (m-1, n-m) - W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m) - C1 = np.dot(W,data) - C2 = np.dot(W,data**2) - mj[i] = np.sqrt(C2 - C1**2) - return mj - - data = ma.array(data, copy=False) - if data.ndim > 2: - raise ValueError("Array 'data' must be at most two dimensional, " - "but got data.ndim = %d" % data.ndim) - - p = np.array(prob, copy=False, ndmin=1) - # Computes quantiles along axis (or globally) - if (axis is None): - return _mjci_1D(data, p) - else: - return ma.apply_along_axis(_mjci_1D, axis, data, p) - - -def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None): - """ - Computes the alpha confidence interval for the selected quantiles of the - data, with Maritz-Jarrett estimators. - - Parameters - ---------- - data : ndarray - Data array. - prob : sequence - Sequence of quantiles to compute. - alpha : float - Confidence level of the intervals. - axis : integer - Axis along which to compute the quantiles. - If None, use a flattened array. - - """ - alpha = min(alpha, 1-alpha) - z = norm.ppf(1-alpha/2.) - xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis) - smj = mjci(data, prob, axis=axis) - return (xq - z * smj, xq + z * smj) - - -def median_cihs(data, alpha=0.05, axis=None): - """ - Computes the alpha-level confidence interval for the median of the data. - - Uses the Hettmasperger-Sheather method. - - Parameters - ---------- - data : array_like - Input data. 
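# Editor's sketch (not part of the original patch): trimmed_mean_ci above is
# simply tmean +/- t.ppf(1 - alpha/2, df) * trimmed_stde with df = count - 1.
# The same steps through the public scipy.stats.mstats API:
import numpy as np
from scipy.stats import mstats, t

x = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9., 50.])
trimmed = mstats.trimr(x, limits=(0.2, 0.2))
tmean = trimmed.mean()
tstde = mstats.trimmed_stde(x, limits=(0.2, 0.2))
df = trimmed.count() - 1
half_width = t.ppf(1 - 0.05 / 2.0, df) * tstde
print(tmean - half_width, tmean + half_width)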
-        Masked values are discarded. The input should be 1D only,
-        or `axis` should be set to None.
-    alpha : float
-        Confidence level of the intervals.
-    axis : integer
-        Axis along which to compute the quantiles. If None, use a flattened
-        array.
-
-    Returns
-    -------
-    median_cihs :
-        Alpha level confidence interval.
-
-    """
-    def _cihs_1D(data, alpha):
-        data = np.sort(data.compressed())
-        n = len(data)
-        alpha = min(alpha, 1-alpha)
-        k = int(binom._ppf(alpha/2., n, 0.5))
-        gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
-        if gk < 1-alpha:
-            k -= 1
-            gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
-        gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
-        I = (gk - 1 + alpha)/(gk - gkk)
-        lambd = (n-k) * I / float(k + (n-2*k)*I)
-        lims = (lambd*data[k] + (1-lambd)*data[k-1],
-                lambd*data[n-k-1] + (1-lambd)*data[n-k])
-        return lims
-    data = ma.array(data, copy=False)
-    # Computes quantiles along axis (or globally)
-    if (axis is None):
-        result = _cihs_1D(data.compressed(), alpha)
-    else:
-        if data.ndim > 2:
-            raise ValueError("Array 'data' must be at most two dimensional, "
-                             "but got data.ndim = %d" % data.ndim)
-        result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
-
-    return result
-
-
-def compare_medians_ms(group_1, group_2, axis=None):
-    """
-    Compares the medians from two independent groups along the given axis.
-
-    The comparison is performed using the McKean-Schrader estimate of the
-    standard error of the medians.
-
-    Parameters
-    ----------
-    group_1 : array_like
-        First dataset.
-    group_2 : array_like
-        Second dataset.
-    axis : int, optional
-        Axis along which the medians are estimated. If None, the arrays are
-        flattened.  If `axis` is not None, then `group_1` and `group_2`
-        should have the same shape.
-
-    Returns
-    -------
-    compare_medians_ms : {float, ndarray}
-        If `axis` is None, then returns a float, otherwise returns a 1-D
-        ndarray of floats with a length equal to the length of `group_1`
-        along `axis`.
-
-    """
-    (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
-    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
-                      mstats.stde_median(group_2, axis=axis))
-    W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
-    return 1 - norm.cdf(W)
-
-
-def idealfourths(data, axis=None):
-    """
-    Returns an estimate of the lower and upper quartiles.
-
-    Uses the ideal fourths algorithm.
-
-    Parameters
-    ----------
-    data : array_like
-        Input array.
-    axis : int, optional
-        Axis along which the quartiles are estimated. If None, the arrays are
-        flattened.
-
-    Returns
-    -------
-    idealfourths : {list of floats, masked array}
-        Returns the two internal values that divide `data` into four parts
-        using the ideal fourths algorithm either along the flattened array
-        (if `axis` is None) or along `axis` of `data`.
-
-    """
-    def _idf(data):
-        x = data.compressed()
-        n = len(x)
-        if n < 3:
-            return [np.nan,np.nan]
-        (j,h) = divmod(n/4. + 5/12.,1)
-        j = int(j)
-        qlo = (1-h)*x[j-1] + h*x[j]
-        k = n - j
-        qup = (1-h)*x[k] + h*x[k-1]
-        return [qlo, qup]
-    data = ma.sort(data, axis=axis).view(MaskedArray)
-    if (axis is None):
-        return _idf(data)
-    else:
-        return ma.apply_along_axis(_idf, axis, data)
-
-
-def rsh(data, points=None):
-    """
-    Evaluates Rosenblatt's shifted histogram estimators for each point
-    on the dataset 'data'.
-
-    Parameters
-    ----------
-    data : sequence
-        Input data. Masked values are ignored.
-    points : sequence
-        Sequence of points where to evaluate Rosenblatt shifted histogram.
-        If None, use the data.
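# Editor's sketch (not part of the original patch): idealfourths above
# interpolates the sorted data at depth n/4 + 5/12 from each end:
import numpy as np

def ideal_fourths_sketch(x):
    xs = np.sort(np.asarray(x, dtype=float))
    n = xs.size
    j, h = divmod(n / 4.0 + 5.0 / 12.0, 1)
    j = int(j)
    qlo = (1 - h) * xs[j - 1] + h * xs[j]
    k = n - j
    qup = (1 - h) * xs[k] + h * xs[k - 1]
    return qlo, qup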
- - """ - data = ma.array(data, copy=False) - if points is None: - points = data - else: - points = np.array(points, copy=False, ndmin=1) - - if data.ndim != 1: - raise AttributeError("The input array should be 1D only !") - - n = data.count() - r = idealfourths(data, axis=None) - h = 1.2 * (r[-1]-r[0]) / n**(1./5) - nhi = (data[:,None] <= points[None,:] + h).sum(0) - nlo = (data[:,None] < points[None,:] - h).sum(0) - return (nhi-nlo) / (2.*n*h) diff --git a/wafo/stats/rv.py b/wafo/stats/rv.py deleted file mode 100644 index 1ef8b36..0000000 --- a/wafo/stats/rv.py +++ /dev/null @@ -1,76 +0,0 @@ -from __future__ import division, print_function, absolute_import - -from numpy import vectorize, deprecate -from numpy.random import random_sample - -__all__ = ['randwppf', 'randwcdf'] - -# XXX: Are these needed anymore? - -##################################### -# General purpose continuous -###################################### - - -@deprecate(message="Deprecated in scipy 0.14.0, use " - "distribution-specific rvs() method instead") -def randwppf(ppf, args=(), size=None): - """ - returns an array of randomly distributed integers of a distribution - whose percent point function (inverse of the CDF or quantile function) - is given. - - args is a tuple of extra arguments to the ppf function (i.e. shape, - location, scale), and size is the size of the output. Note the ppf - function must accept an array of q values to compute over. - - """ - U = random_sample(size=size) - return ppf(*(U,)+args) - - -@deprecate(message="Deprecated in scipy 0.14.0, use " - "distribution-specific rvs() method instead") -def randwcdf(cdf, mean=1.0, args=(), size=None): - """ - Returns an array of randomly distributed integers given a CDF. - - Given a cumulative distribution function (CDF) returns an array of - randomly distributed integers that would satisfy the CDF. - - Parameters - ---------- - cdf : function - CDF function that accepts a single value and `args`, and returns - an single value. - mean : float, optional - The mean of the distribution which helps the solver. Defaults - to 1.0. - args : tuple, optional - Extra arguments to the cdf function (i.e. shape, location, scale) - size : {int, None}, optional - Is the size of the output. If None, only 1 value will be returned. - - Returns - ------- - randwcdf : ndarray - Array of random numbers. - - Notes - ----- - Can use the ``scipy.stats.distributions.*.cdf`` functions for the - `cdf` parameter. 
- - """ - import scipy.optimize as optimize - - def _ppfopt(x, q, *nargs): - newargs = (x,)+nargs - return cdf(*newargs) - q - - def _ppf(q, *nargs): - return optimize.fsolve(_ppfopt, mean, args=(q,)+nargs) - - _vppf = vectorize(_ppf) - U = random_sample(size=size) - return _vppf(*(U,)+args) diff --git a/wafo/stats/six.py b/wafo/stats/six.py deleted file mode 100644 index f0bc0b6..0000000 --- a/wafo/stats/six.py +++ /dev/null @@ -1,389 +0,0 @@ -"""Utilities for writing code that runs on Python 2 and 3""" - -# Copyright (c) 2010-2012 Benjamin Peterson -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of -# this software and associated documentation files (the "Software"), to deal in -# the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, -# subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -import operator -import sys -import types - -__author__ = "Benjamin Peterson " -__version__ = "1.2.0" - - -# True if we are running on Python 3. -PY3 = sys.version_info[0] == 3 - -if PY3: - string_types = str, - integer_types = int, - class_types = type, - text_type = str - binary_type = bytes - - MAXSIZE = sys.maxsize -else: - string_types = basestring, - integer_types = (int, long) - class_types = (type, types.ClassType) - text_type = unicode - binary_type = str - - if sys.platform.startswith("java"): - # Jython always uses 32 bits. - MAXSIZE = int((1 << 31) - 1) - else: - # It's possible to have sizeof(long) != sizeof(Py_ssize_t). - class X(object): - def __len__(self): - return 1 << 31 - try: - len(X()) - except OverflowError: - # 32-bit - MAXSIZE = int((1 << 31) - 1) - else: - # 64-bit - MAXSIZE = int((1 << 63) - 1) - del X - - -def _add_doc(func, doc): - """Add documentation to a function.""" - func.__doc__ = doc - - -def _import_module(name): - """Import module, returning the module after the last dot.""" - __import__(name) - return sys.modules[name] - - -class _LazyDescr(object): - - def __init__(self, name): - self.name = name - - def __get__(self, obj, tp): - result = self._resolve() - setattr(obj, self.name, result) - # This is a bit ugly, but it avoids running this again. 
- delattr(tp, self.name) - return result - - -class MovedModule(_LazyDescr): - - def __init__(self, name, old, new=None): - super(MovedModule, self).__init__(name) - if PY3: - if new is None: - new = name - self.mod = new - else: - self.mod = old - - def _resolve(self): - return _import_module(self.mod) - - -class MovedAttribute(_LazyDescr): - - def __init__(self, name, old_mod, new_mod, old_attr=None, new_attr=None): - super(MovedAttribute, self).__init__(name) - if PY3: - if new_mod is None: - new_mod = name - self.mod = new_mod - if new_attr is None: - if old_attr is None: - new_attr = name - else: - new_attr = old_attr - self.attr = new_attr - else: - self.mod = old_mod - if old_attr is None: - old_attr = name - self.attr = old_attr - - def _resolve(self): - module = _import_module(self.mod) - return getattr(module, self.attr) - - -class _MovedItems(types.ModuleType): - """Lazy loading of moved objects""" - - -_moved_attributes = [ - MovedAttribute("cStringIO", "cStringIO", "io", "StringIO"), - MovedAttribute("filter", "itertools", "builtins", "ifilter", "filter"), - MovedAttribute("input", "__builtin__", "builtins", "raw_input", "input"), - MovedAttribute("map", "itertools", "builtins", "imap", "map"), - MovedAttribute("reload_module", "__builtin__", "imp", "reload"), - MovedAttribute("reduce", "__builtin__", "functools"), - MovedAttribute("StringIO", "StringIO", "io"), - MovedAttribute("xrange", "__builtin__", "builtins", "xrange", "range"), - MovedAttribute("zip", "itertools", "builtins", "izip", "zip"), - - MovedModule("builtins", "__builtin__"), - MovedModule("configparser", "ConfigParser"), - MovedModule("copyreg", "copy_reg"), - MovedModule("http_cookiejar", "cookielib", "http.cookiejar"), - MovedModule("http_cookies", "Cookie", "http.cookies"), - MovedModule("html_entities", "htmlentitydefs", "html.entities"), - MovedModule("html_parser", "HTMLParser", "html.parser"), - MovedModule("http_client", "httplib", "http.client"), - MovedModule("email_mime_multipart", "email.MIMEMultipart", "email.mime.multipart"), - MovedModule("email_mime_text", "email.MIMEText", "email.mime.text"), - MovedModule("email_mime_base", "email.MIMEBase", "email.mime.base"), - MovedModule("BaseHTTPServer", "BaseHTTPServer", "http.server"), - MovedModule("CGIHTTPServer", "CGIHTTPServer", "http.server"), - MovedModule("SimpleHTTPServer", "SimpleHTTPServer", "http.server"), - MovedModule("cPickle", "cPickle", "pickle"), - MovedModule("queue", "Queue"), - MovedModule("reprlib", "repr"), - MovedModule("socketserver", "SocketServer"), - MovedModule("tkinter", "Tkinter"), - MovedModule("tkinter_dialog", "Dialog", "tkinter.dialog"), - MovedModule("tkinter_filedialog", "FileDialog", "tkinter.filedialog"), - MovedModule("tkinter_scrolledtext", "ScrolledText", "tkinter.scrolledtext"), - MovedModule("tkinter_simpledialog", "SimpleDialog", "tkinter.simpledialog"), - MovedModule("tkinter_tix", "Tix", "tkinter.tix"), - MovedModule("tkinter_constants", "Tkconstants", "tkinter.constants"), - MovedModule("tkinter_dnd", "Tkdnd", "tkinter.dnd"), - MovedModule("tkinter_colorchooser", "tkColorChooser", - "tkinter.colorchooser"), - MovedModule("tkinter_commondialog", "tkCommonDialog", - "tkinter.commondialog"), - MovedModule("tkinter_tkfiledialog", "tkFileDialog", "tkinter.filedialog"), - MovedModule("tkinter_font", "tkFont", "tkinter.font"), - MovedModule("tkinter_messagebox", "tkMessageBox", "tkinter.messagebox"), - MovedModule("tkinter_tksimpledialog", "tkSimpleDialog", - "tkinter.simpledialog"), - 
MovedModule("urllib_robotparser", "robotparser", "urllib.robotparser"), - MovedModule("winreg", "_winreg"), -] -for attr in _moved_attributes: - setattr(_MovedItems, attr.name, attr) -del attr - -moves = sys.modules[__name__ + ".moves"] = _MovedItems("moves") - - -def add_move(move): - """Add an item to six.moves.""" - setattr(_MovedItems, move.name, move) - - -def remove_move(name): - """Remove item from six.moves.""" - try: - delattr(_MovedItems, name) - except AttributeError: - try: - del moves.__dict__[name] - except KeyError: - raise AttributeError("no such move, %r" % (name,)) - - -if PY3: - _meth_func = "__func__" - _meth_self = "__self__" - - _func_code = "__code__" - _func_defaults = "__defaults__" - - _iterkeys = "keys" - _itervalues = "values" - _iteritems = "items" -else: - _meth_func = "im_func" - _meth_self = "im_self" - - _func_code = "func_code" - _func_defaults = "func_defaults" - - _iterkeys = "iterkeys" - _itervalues = "itervalues" - _iteritems = "iteritems" - - -try: - advance_iterator = next -except NameError: - def advance_iterator(it): - return it.next() -next = advance_iterator - - -if PY3: - def get_unbound_function(unbound): - return unbound - - Iterator = object - - def callable(obj): - return any("__call__" in klass.__dict__ for klass in type(obj).__mro__) -else: - def get_unbound_function(unbound): - return unbound.im_func - - class Iterator(object): - - def next(self): - return type(self).__next__(self) - - callable = callable -_add_doc(get_unbound_function, - """Get the function out of a possibly unbound function""") - - -get_method_function = operator.attrgetter(_meth_func) -get_method_self = operator.attrgetter(_meth_self) -get_function_code = operator.attrgetter(_func_code) -get_function_defaults = operator.attrgetter(_func_defaults) - - -def iterkeys(d): - """Return an iterator over the keys of a dictionary.""" - return iter(getattr(d, _iterkeys)()) - - -def itervalues(d): - """Return an iterator over the values of a dictionary.""" - return iter(getattr(d, _itervalues)()) - - -def iteritems(d): - """Return an iterator over the (key, value) pairs of a dictionary.""" - return iter(getattr(d, _iteritems)()) - - -if PY3: - def b(s): - return s.encode("latin-1") - - def u(s): - return s - - if sys.version_info[1] <= 1: - def int2byte(i): - return bytes((i,)) - else: - # This is about 2x faster than the implementation above on 3.2+ - int2byte = operator.methodcaller("to_bytes", 1, "big") - import io - StringIO = io.StringIO - BytesIO = io.BytesIO -else: - def b(s): - return s - - def u(s): - return unicode(s, "unicode_escape") - int2byte = chr - import StringIO - StringIO = BytesIO = StringIO.StringIO -_add_doc(b, """Byte literal""") -_add_doc(u, """Text literal""") - - -if PY3: - import builtins # @UnresolvedImport - exec_ = getattr(builtins, "exec") - - def reraise(tp, value, tb=None): - if value.__traceback__ is not tb: - raise value.with_traceback(tb) - raise value - - print_ = getattr(builtins, "print") - del builtins - -else: - def exec_(code, globs=None, locs=None): - """Execute code in a namespace.""" - if globs is None: - frame = sys._getframe(1) - globs = frame.f_globals - if locs is None: - locs = frame.f_locals - del frame - elif locs is None: - locs = globs - exec("""exec code in globs, locs""") - - exec_("""def reraise(tp, value, tb=None): - raise tp, value, tb -""") - - def print_(*args, **kwargs): - """The new-style print function.""" - fp = kwargs.pop("file", sys.stdout) - if fp is None: - return - - def write(data): - if not isinstance(data, 
basestring): - data = str(data) - fp.write(data) - want_unicode = False - sep = kwargs.pop("sep", None) - if sep is not None: - if isinstance(sep, unicode): - want_unicode = True - elif not isinstance(sep, str): - raise TypeError("sep must be None or a string") - end = kwargs.pop("end", None) - if end is not None: - if isinstance(end, unicode): - want_unicode = True - elif not isinstance(end, str): - raise TypeError("end must be None or a string") - if kwargs: - raise TypeError("invalid keyword arguments to print()") - if not want_unicode: - for arg in args: - if isinstance(arg, unicode): - want_unicode = True - break - if want_unicode: - newline = unicode("\n") - space = unicode(" ") - else: - newline = "\n" - space = " " - if sep is None: - sep = space - if end is None: - end = newline - for i, arg in enumerate(args): - if i: - write(sep) - write(arg) - write(end) - -_add_doc(reraise, """Reraise an exception.""") - - -def with_metaclass(meta, base=object): - """Create a base class with a metaclass.""" - return meta("NewBase", (base,), {}) diff --git a/wafo/stats/stats.py b/wafo/stats/stats.py deleted file mode 100644 index ddf779b..0000000 --- a/wafo/stats/stats.py +++ /dev/null @@ -1,4508 +0,0 @@ -# Copyright (c) Gary Strangman. All rights reserved -# -# Disclaimer -# -# This software is provided "as-is". There are no expressed or implied -# warranties of any kind, including, but not limited to, the warranties -# of merchantability and fitness for a given application. In no event -# shall Gary Strangman be liable for any direct, indirect, incidental, -# special, exemplary or consequential damages (including, but not limited -# to, loss of use, data or profits, or business interruption) however -# caused and on any theory of liability, whether in contract, strict -# liability or tort (including negligence or otherwise) arising in any way -# out of the use of this software, even if advised of the possibility of -# such damage. -# - -# -# Heavily adapted for use by SciPy 2002 by Travis Oliphant -""" -A collection of basic statistical functions for python. The function -names appear below. - - Some scalar functions defined here are also available in the scipy.special - package where they work on arbitrary sized arrays. - -Disclaimers: The function list is obviously incomplete and, worse, the -functions are not optimized. All functions have been tested (some more -so than others), but they are far from bulletproof. Thus, as with any -free software, no warranty or guarantee is expressed or implied. :-) A -few extra functions that don't appear in the list below can be found by -interested treasure-hunters. These functions don't necessarily have -both list and array versions but were deemed useful. - -Central Tendency ----------------- -.. autosummary:: - :toctree: generated/ - - gmean - hmean - mode - -Moments -------- -.. autosummary:: - :toctree: generated/ - - moment - variation - skew - kurtosis - normaltest - -Moments Handling NaN: - -.. autosummary:: - :toctree: generated/ - - nanmean - nanmedian - nanstd - -Altered Versions ----------------- -.. autosummary:: - :toctree: generated/ - - tmean - tvar - tstd - tsem - describe - -Frequency Stats ---------------- -.. autosummary:: - :toctree: generated/ - - itemfreq - scoreatpercentile - percentileofscore - histogram - cumfreq - relfreq - -Variability ------------ -.. autosummary:: - :toctree: generated/ - - obrientransform - signaltonoise - sem - -Trimming Functions ------------------- -.. 
autosummary:: - :toctree: generated/ - - threshold - trimboth - trim1 - -Correlation Functions ---------------------- -.. autosummary:: - :toctree: generated/ - - pearsonr - fisher_exact - spearmanr - pointbiserialr - kendalltau - linregress - theilslopes - -Inferential Stats ------------------ -.. autosummary:: - :toctree: generated/ - - ttest_1samp - ttest_ind - ttest_rel - chisquare - power_divergence - ks_2samp - mannwhitneyu - ranksums - wilcoxon - kruskal - friedmanchisquare - -Probability Calculations ------------------------- -.. autosummary:: - :toctree: generated/ - - chisqprob - zprob - fprob - betai - -ANOVA Functions ---------------- -.. autosummary:: - :toctree: generated/ - - f_oneway - f_value - -Support Functions ------------------ -.. autosummary:: - :toctree: generated/ - - ss - square_of_sums - rankdata - -References ----------- -.. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard - Probability and Statistics Tables and Formulae. Chapman & Hall: New - York. 2000. - -""" - -from __future__ import division, print_function, absolute_import - -import warnings -import math - -from scipy._lib.six import xrange - -# friedmanchisquare patch uses python sum -pysum = sum # save it before it gets overwritten - -# Scipy imports. -from scipy._lib.six import callable, string_types -from numpy import array, asarray, ma, zeros, sum -import scipy.special as special -import scipy.linalg as linalg -import numpy as np - -from . import futil -from . import distributions -try: - from scipy.stats._rank import rankdata, tiecorrect -except: - rankdata = tiecorrect = None -__all__ = ['find_repeats', 'gmean', 'hmean', 'mode', 'tmean', 'tvar', - 'tmin', 'tmax', 'tstd', 'tsem', 'moment', 'variation', - 'skew', 'kurtosis', 'describe', 'skewtest', 'kurtosistest', - 'normaltest', 'jarque_bera', 'itemfreq', - 'scoreatpercentile', 'percentileofscore', 'histogram', - 'histogram2', 'cumfreq', 'relfreq', 'obrientransform', - 'signaltonoise', 'sem', 'zmap', 'zscore', 'threshold', - 'sigmaclip', 'trimboth', 'trim1', 'trim_mean', 'f_oneway', - 'pearsonr', 'fisher_exact', 'spearmanr', 'pointbiserialr', - 'kendalltau', 'linregress', 'theilslopes', 'ttest_1samp', - 'ttest_ind', 'ttest_rel', 'kstest', 'chisquare', - 'power_divergence', 'ks_2samp', 'mannwhitneyu', - 'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare', - 'zprob', 'chisqprob', 'ksprob', 'fprob', 'betai', - 'f_value_wilks_lambda', 'f_value', 'f_value_multivariate', - 'ss', 'square_of_sums', 'fastsort', 'rankdata', 'nanmean', - 'nanstd', 'nanmedian', ] - - -def _chk_asarray(a, axis): - if axis is None: - a = np.ravel(a) - outaxis = 0 - else: - a = np.asarray(a) - outaxis = axis - return a, outaxis - - -def _chk2_asarray(a, b, axis): - if axis is None: - a = np.ravel(a) - b = np.ravel(b) - outaxis = 0 - else: - a = np.asarray(a) - b = np.asarray(b) - outaxis = axis - return a, b, outaxis - - -def find_repeats(arr): - """ - Find repeats and repeat counts. - - Parameters - ---------- - arr : array_like - Input array - - Returns - ------- - find_repeats : tuple - Returns a tuple of two 1-D ndarrays. The first ndarray are the repeats - as sorted, unique values that are repeated in `arr`. The second - ndarray are the counts mapped one-to-one of the repeated values - in the first ndarray. - - Examples - -------- - >>> import scipy.stats as stats - >>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5]) - (array([ 2. 
]), array([ 4 ], dtype=int32) - - >>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]]) - (array([ 4., 5.]), array([2, 2], dtype=int32)) - - """ - v1,v2, n = futil.dfreps(arr) - return v1[:n],v2[:n] - -####### -### NAN friendly functions -######## - - -def nanmean(x, axis=0): - """ - Compute the mean over the given axis ignoring nans. - - Parameters - ---------- - x : ndarray - Input array. - axis : int, optional - Axis along which the mean is computed. Default is 0, i.e. the - first axis. - - Returns - ------- - m : float - The mean of `x`, ignoring nans. - - See Also - -------- - nanstd, nanmedian - - Examples - -------- - >>> from scipy import stats - >>> a = np.linspace(0, 4, 3) - >>> a - array([ 0., 2., 4.]) - >>> a[-1] = np.nan - >>> stats.nanmean(a) - 1.0 - - """ - x, axis = _chk_asarray(x, axis) - x = x.copy() - Norig = x.shape[axis] - mask = np.isnan(x) - factor = 1.0 - np.sum(mask, axis) / Norig - - x[mask] = 0.0 - return np.mean(x, axis) / factor - - -def nanstd(x, axis=0, bias=False): - """ - Compute the standard deviation over the given axis, ignoring nans. - - Parameters - ---------- - x : array_like - Input array. - axis : int or None, optional - Axis along which the standard deviation is computed. Default is 0. - If None, compute over the whole array `x`. - bias : bool, optional - If True, the biased (normalized by N) definition is used. If False - (default), the unbiased definition is used. - - Returns - ------- - s : float - The standard deviation. - - See Also - -------- - nanmean, nanmedian - - Examples - -------- - >>> from scipy import stats - >>> a = np.arange(10, dtype=float) - >>> a[1:3] = np.nan - >>> np.std(a) - nan - >>> stats.nanstd(a) - 2.9154759474226504 - >>> stats.nanstd(a.reshape(2, 5), axis=1) - array([ 2.0817, 1.5811]) - >>> stats.nanstd(a.reshape(2, 5), axis=None) - 2.9154759474226504 - - """ - x, axis = _chk_asarray(x, axis) - x = x.copy() - Norig = x.shape[axis] - - mask = np.isnan(x) - Nnan = np.sum(mask, axis) * 1.0 - n = Norig - Nnan - - x[mask] = 0.0 - m1 = np.sum(x, axis) / n - - if axis: - d = x - np.expand_dims(m1, axis) - else: - d = x - m1 - - d *= d - - m2 = np.sum(d, axis) - m1 * m1 * Nnan - - if bias: - m2c = m2 / n - else: - m2c = m2 / (n - 1.0) - - return np.sqrt(m2c) - - -def _nanmedian(arr1d): # This only works on 1d arrays - """Private function for rank a arrays. Compute the median ignoring Nan. - - Parameters - ---------- - arr1d : ndarray - Input array, of rank 1. - - Results - ------- - m : float - The median. - """ - x = arr1d.copy() - c = np.isnan(x) - s = np.where(c)[0] - if s.size == x.size: - warnings.warn("All-NaN slice encountered", RuntimeWarning) - return np.nan - elif s.size != 0: - # select non-nans at end of array - enonan = x[-s.size:][~c[-s.size:]] - # fill nans in beginning of array with non-nans of end - x[s[:enonan.size]] = enonan - # slice nans away - x = x[:-s.size] - return np.median(x, overwrite_input=True) - - -def nanmedian(x, axis=0): - """ - Compute the median along the given axis ignoring nan values. - - Parameters - ---------- - x : array_like - Input array. - axis : int, optional - Axis along which the median is computed. Default is 0, i.e. the - first axis. - - Returns - ------- - m : float - The median of `x` along `axis`. 
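# Editor's sketch (not part of the original patch): nanmean above zeroes the
# NaNs and rescales by the fraction of valid entries, which is what
# np.nanmean (numpy >= 1.8) now does internally:
import numpy as np

x = np.array([1.0, np.nan, 3.0])
mask = np.isnan(x)
mean = np.where(mask, 0.0, x).sum() / (x.size - mask.sum())   # 2.0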
- - See Also - -------- - nanstd, nanmean, numpy.nanmedian - - Examples - -------- - >>> from scipy import stats - >>> a = np.array([0, 3, 1, 5, 5, np.nan]) - >>> stats.nanmedian(a) - array(3.0) - - >>> b = np.array([0, 3, 1, 5, 5, np.nan, 5]) - >>> stats.nanmedian(b) - array(4.0) - - Example with axis: - - >>> c = np.arange(30.).reshape(5,6) - >>> idx = np.array([False, False, False, True, False] * 6).reshape(5,6) - >>> c[idx] = np.nan - >>> c - array([[ 0., 1., 2., nan, 4., 5.], - [ 6., 7., nan, 9., 10., 11.], - [ 12., nan, 14., 15., 16., 17.], - [ nan, 19., 20., 21., 22., nan], - [ 24., 25., 26., 27., nan, 29.]]) - >>> stats.nanmedian(c, axis=1) - array([ 2. , 9. , 15. , 20.5, 26. ]) - - """ - x, axis = _chk_asarray(x, axis) - if x.ndim == 0: - return float(x.item()) - if hasattr(np, 'nanmedian'): # numpy 1.9 faster for some cases - return np.nanmedian(x, axis) - x = np.apply_along_axis(_nanmedian, axis, x) - if x.ndim == 0: - x = float(x.item()) - return x - - -##################################### -######## CENTRAL TENDENCY ######## -##################################### - - -def gmean(a, axis=0, dtype=None): - """ - Compute the geometric mean along the specified axis. - - Returns the geometric average of the array elements. - That is: n-th root of (x1 * x2 * ... * xn) - - Parameters - ---------- - a : array_like - Input array or object that can be converted to an array. - axis : int, optional, default axis=0 - Axis along which the geometric mean is computed. - dtype : dtype, optional - Type of the returned array and of the accumulator in which the - elements are summed. If dtype is not specified, it defaults to the - dtype of a, unless a has an integer dtype with a precision less than - that of the default platform integer. In that case, the default - platform integer is used. - - Returns - ------- - gmean : ndarray - see dtype parameter above - - See Also - -------- - numpy.mean : Arithmetic average - numpy.average : Weighted average - hmean : Harmonic mean - - Notes - ----- - The geometric average is computed over a single dimension of the input - array, axis=0 by default, or all values in the array if axis=None. - float64 intermediate and return values are used for integer inputs. - - Use masked arrays to ignore any non-finite values in the input or that - arise in the calculations such as Not a Number and infinity because masked - arrays automatically mask any non-finite values. - - """ - if not isinstance(a, np.ndarray): # if not an ndarray object attempt to convert it - log_a = np.log(np.array(a, dtype=dtype)) - elif dtype: # Must change the default dtype allowing array type - if isinstance(a,np.ma.MaskedArray): - log_a = np.log(np.ma.asarray(a, dtype=dtype)) - else: - log_a = np.log(np.asarray(a, dtype=dtype)) - else: - log_a = np.log(a) - return np.exp(log_a.mean(axis=axis)) - - -def hmean(a, axis=0, dtype=None): - """ - Calculates the harmonic mean along the specified axis. - - That is: n / (1/x1 + 1/x2 + ... + 1/xn) - - Parameters - ---------- - a : array_like - Input array, masked array or object that can be converted to an array. - axis : int, optional, default axis=0 - Axis along which the harmonic mean is computed. - dtype : dtype, optional - Type of the returned array and of the accumulator in which the - elements are summed. If `dtype` is not specified, it defaults to the - dtype of `a`, unless `a` has an integer `dtype` with a precision less - than that of the default platform integer. In that case, the default - platform integer is used. 
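# Editor's sketch (not part of the original patch): gmean above is
# exp(mean(log(a))), which avoids the overflow of multiplying n values
# together before taking the n-th root:
import numpy as np

a = np.array([1.0, 10.0, 100.0])
g = np.exp(np.log(a).mean())        # 10.0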
- - Returns - ------- - hmean : ndarray - see `dtype` parameter above - - See Also - -------- - numpy.mean : Arithmetic average - numpy.average : Weighted average - gmean : Geometric mean - - Notes - ----- - The harmonic mean is computed over a single dimension of the input - array, axis=0 by default, or all values in the array if axis=None. - float64 intermediate and return values are used for integer inputs. - - Use masked arrays to ignore any non-finite values in the input or that - arise in the calculations such as Not a Number and infinity. - - """ - if not isinstance(a, np.ndarray): - a = np.array(a, dtype=dtype) - if np.all(a > 0): # Harmonic mean only defined if greater than zero - if isinstance(a, np.ma.MaskedArray): - size = a.count(axis) - else: - if axis is None: - a = a.ravel() - size = a.shape[0] - else: - size = a.shape[axis] - return size / np.sum(1.0/a, axis=axis, dtype=dtype) - else: - raise ValueError("Harmonic mean only defined if all elements greater than zero") - - -def mode(a, axis=0): - """ - Returns an array of the modal (most common) value in the passed array. - - If there is more than one such value, only the first is returned. - The bin-count for the modal bins is also returned. - - Parameters - ---------- - a : array_like - n-dimensional array of which to find mode(s). - axis : int, optional - Axis along which to operate. Default is 0, i.e. the first axis. - - Returns - ------- - vals : ndarray - Array of modal values. - counts : ndarray - Array of counts for each mode. - - Examples - -------- - >>> a = np.array([[6, 8, 3, 0], - [3, 2, 1, 7], - [8, 1, 8, 4], - [5, 3, 0, 5], - [4, 7, 5, 9]]) - >>> from scipy import stats - >>> stats.mode(a) - (array([[ 3., 1., 0., 0.]]), array([[ 1., 1., 1., 1.]])) - - To get mode of whole array, specify axis=None: - - >>> stats.mode(a, axis=None) - (array([ 3.]), array([ 3.])) - - """ - a, axis = _chk_asarray(a, axis) - scores = np.unique(np.ravel(a)) # get ALL unique values - testshape = list(a.shape) - testshape[axis] = 1 - oldmostfreq = np.zeros(testshape, dtype=a.dtype) - oldcounts = np.zeros(testshape) - for score in scores: - template = (a == score) - counts = np.expand_dims(np.sum(template, axis),axis) - mostfrequent = np.where(counts > oldcounts, score, oldmostfreq) - oldcounts = np.maximum(counts, oldcounts) - oldmostfreq = mostfrequent - return mostfrequent, oldcounts - - -def mask_to_limits(a, limits, inclusive): - """Mask an array for values outside of given limits. - - This is primarily a utility function. - - Parameters - ---------- - a : array - limits : (float or None, float or None) - A tuple consisting of the (lower limit, upper limit). Values in the - input array less than the lower limit or greater than the upper limit - will be masked out. None implies no limit. - inclusive : (bool, bool) - A tuple consisting of the (lower flag, upper flag). These flags - determine whether values exactly equal to lower or upper are allowed. - - Returns - ------- - A MaskedArray. - - Raises - ------ - A ValueError if there are no values within the given limits. 
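# Editor's sketch (not part of the original patch): mode above keeps a
# running argmax of the counts so it can work along an axis; for the 1-D
# case np.unique with return_counts (numpy >= 1.9) gives the same answer:
import numpy as np

a = np.array([3, 1, 3, 2, 3, 1])
vals, counts = np.unique(a, return_counts=True)
print(vals[counts.argmax()], counts.max())    # 3 3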
- """ - lower_limit, upper_limit = limits - lower_include, upper_include = inclusive - am = ma.MaskedArray(a) - if lower_limit is not None: - if lower_include: - am = ma.masked_less(am, lower_limit) - else: - am = ma.masked_less_equal(am, lower_limit) - - if upper_limit is not None: - if upper_include: - am = ma.masked_greater(am, upper_limit) - else: - am = ma.masked_greater_equal(am, upper_limit) - - if am.count() == 0: - raise ValueError("No array values within given limits") - - return am - - -def tmean(a, limits=None, inclusive=(True, True)): - """ - Compute the trimmed mean. - - This function finds the arithmetic mean of given values, ignoring values - outside the given `limits`. - - Parameters - ---------- - a : array_like - Array of values. - limits : None or (lower limit, upper limit), optional - Values in the input array less than the lower limit or greater than the - upper limit will be ignored. When limits is None (default), then all - values are used. Either of the limit values in the tuple can also be - None representing a half-open interval. - inclusive : (bool, bool), optional - A tuple consisting of the (lower flag, upper flag). These flags - determine whether values exactly equal to the lower or upper limits - are included. The default value is (True, True). - - Returns - ------- - tmean : float - - """ - a = asarray(a) - if limits is None: - return np.mean(a, None) - - am = mask_to_limits(a.ravel(), limits, inclusive) - return am.mean() - - -def masked_var(am): - m = am.mean() - s = ma.add.reduce((am - m)**2) - n = am.count() - 1.0 - return s / n - - -def tvar(a, limits=None, inclusive=(True, True)): - """ - Compute the trimmed variance - - This function computes the sample variance of an array of values, - while ignoring values which are outside of given `limits`. - - Parameters - ---------- - a : array_like - Array of values. - limits : None or (lower limit, upper limit), optional - Values in the input array less than the lower limit or greater than the - upper limit will be ignored. When limits is None, then all values are - used. Either of the limit values in the tuple can also be None - representing a half-open interval. The default value is None. - inclusive : (bool, bool), optional - A tuple consisting of the (lower flag, upper flag). These flags - determine whether values exactly equal to the lower or upper limits - are included. The default value is (True, True). - - Returns - ------- - tvar : float - Trimmed variance. - - Notes - ----- - `tvar` computes the unbiased sample variance, i.e. it uses a correction - factor ``n / (n - 1)``. - - """ - a = asarray(a) - a = a.astype(float).ravel() - if limits is None: - n = len(a) - return a.var()*(n/(n-1.)) - am = mask_to_limits(a, limits, inclusive) - return masked_var(am) - - -def tmin(a, lowerlimit=None, axis=0, inclusive=True): - """ - Compute the trimmed minimum - - This function finds the miminum value of an array `a` along the - specified axis, but only considering values greater than a specified - lower limit. - - Parameters - ---------- - a : array_like - array of values - lowerlimit : None or float, optional - Values in the input array less than the given limit will be ignored. - When lowerlimit is None, then all values are used. The default value - is None. - axis : None or int, optional - Operate along this axis. None means to use the flattened array and - the default is zero - inclusive : {True, False}, optional - This flag determines whether values exactly equal to the lower limit - are included. 
The default value is True. - - Returns - ------- - tmin : float - - """ - a, axis = _chk_asarray(a, axis) - am = mask_to_limits(a, (lowerlimit, None), (inclusive, False)) - return ma.minimum.reduce(am, axis) - - -def tmax(a, upperlimit=None, axis=0, inclusive=True): - """ - Compute the trimmed maximum - - This function computes the maximum value of an array along a given axis, - while ignoring values larger than a specified upper limit. - - Parameters - ---------- - a : array_like - array of values - upperlimit : None or float, optional - Values in the input array greater than the given limit will be ignored. - When upperlimit is None, then all values are used. The default value - is None. - axis : None or int, optional - Operate along this axis. None means to use the flattened array and - the default is zero. - inclusive : {True, False}, optional - This flag determines whether values exactly equal to the upper limit - are included. The default value is True. - - Returns - ------- - tmax : float - - """ - a, axis = _chk_asarray(a, axis) - am = mask_to_limits(a, (None, upperlimit), (False, inclusive)) - return ma.maximum.reduce(am, axis) - - -def tstd(a, limits=None, inclusive=(True, True)): - """ - Compute the trimmed sample standard deviation - - This function finds the sample standard deviation of given values, - ignoring values outside the given `limits`. - - Parameters - ---------- - a : array_like - array of values - limits : None or (lower limit, upper limit), optional - Values in the input array less than the lower limit or greater than the - upper limit will be ignored. When limits is None, then all values are - used. Either of the limit values in the tuple can also be None - representing a half-open interval. The default value is None. - inclusive : (bool, bool), optional - A tuple consisting of the (lower flag, upper flag). These flags - determine whether values exactly equal to the lower or upper limits - are included. The default value is (True, True). - - Returns - ------- - tstd : float - - Notes - ----- - `tstd` computes the unbiased sample standard deviation, i.e. it uses a - correction factor ``n / (n - 1)``. - - """ - return np.sqrt(tvar(a, limits, inclusive)) - - -def tsem(a, limits=None, inclusive=(True, True)): - """ - Compute the trimmed standard error of the mean. - - This function finds the standard error of the mean for given - values, ignoring values outside the given `limits`. - - Parameters - ---------- - a : array_like - array of values - limits : None or (lower limit, upper limit), optional - Values in the input array less than the lower limit or greater than the - upper limit will be ignored. When limits is None, then all values are - used. Either of the limit values in the tuple can also be None - representing a half-open interval. The default value is None. - inclusive : (bool, bool), optional - A tuple consisting of the (lower flag, upper flag). These flags - determine whether values exactly equal to the lower or upper limits - are included. The default value is (True, True). - - Returns - ------- - tsem : float - - Notes - ----- - `tsem` uses unbiased sample standard deviation, i.e. it uses a - correction factor ``n / (n - 1)``. 
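# Editor's sketch (not part of the original patch): the trimmed statistics
# above all share one mechanism, mask values outside `limits` and reduce
# what remains.  Quick usage check via the public scipy.stats names:
import numpy as np
from scipy import stats

x = np.arange(20.0)
print(stats.tmean(x, limits=(2, 17)))    # mean of the values in [2, 17]
print(stats.tmin(x, lowerlimit=5))       # 5.0
print(stats.tstd(x, limits=(2, 17)))     # unbiased std of the kept values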
- - """ - a = np.asarray(a).ravel() - if limits is None: - return a.std(ddof=1) / np.sqrt(a.size) - - am = mask_to_limits(a, limits, inclusive) - sd = np.sqrt(masked_var(am)) - return sd / np.sqrt(am.count()) - - -##################################### -############ MOMENTS ############# -##################################### - -def moment(a, moment=1, axis=0): - """ - Calculates the nth moment about the mean for a sample. - - Generally used to calculate coefficients of skewness and - kurtosis. - - Parameters - ---------- - a : array_like - data - moment : int - order of central moment that is returned - axis : int or None - Axis along which the central moment is computed. If None, then the data - array is raveled. The default axis is zero. - - Returns - ------- - n-th central moment : ndarray or float - The appropriate moment along the given axis or over all values if axis - is None. The denominator for the moment calculation is the number of - observations, no degrees of freedom correction is done. - - """ - a, axis = _chk_asarray(a, axis) - if moment == 1: - # By definition the first moment about the mean is 0. - shape = list(a.shape) - del shape[axis] - if shape: - # return an actual array of the appropriate shape - return np.zeros(shape, dtype=float) - else: - # the input was 1D, so return a scalar instead of a rank-0 array - return np.float64(0.0) - else: - mn = np.expand_dims(np.mean(a,axis), axis) - s = np.power((a-mn), moment) - return np.mean(s, axis) - - -def variation(a, axis=0): - """ - Computes the coefficient of variation, the ratio of the biased standard - deviation to the mean. - - Parameters - ---------- - a : array_like - Input array. - axis : int or None - Axis along which to calculate the coefficient of variation. - - References - ---------- - .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard - Probability and Statistics Tables and Formulae. Chapman & Hall: New - York. 2000. - - """ - a, axis = _chk_asarray(a, axis) - return a.std(axis)/a.mean(axis) - - -def skew(a, axis=0, bias=True): - """ - Computes the skewness of a data set. - - For normally distributed data, the skewness should be about 0. A skewness - value > 0 means that there is more weight in the left tail of the - distribution. The function `skewtest` can be used to determine if the - skewness value is close enough to 0, statistically speaking. - - Parameters - ---------- - a : ndarray - data - axis : int or None - axis along which skewness is calculated - bias : bool - If False, then the calculations are corrected for statistical bias. - - Returns - ------- - skewness : ndarray - The skewness of values along an axis, returning 0 where all values are - equal. - - References - ---------- - [CRCProbStat2000]_ Section 2.2.24.1 - - .. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard - Probability and Statistics Tables and Formulae. Chapman & Hall: New - York. 2000. - - """ - a, axis = _chk_asarray(a,axis) - n = a.shape[axis] - m2 = moment(a, 2, axis) - m3 = moment(a, 3, axis) - zero = (m2 == 0) - vals = np.where(zero, 0, m3 / m2**1.5) - if not bias: - can_correct = (n > 2) & (m2 > 0) - if can_correct.any(): - m2 = np.extract(can_correct, m2) - m3 = np.extract(can_correct, m3) - nval = np.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5 - np.place(vals, can_correct, nval) - if vals.ndim == 0: - return vals.item() - return vals - - -def kurtosis(a, axis=0, fisher=True, bias=True): - """ - Computes the kurtosis (Fisher or Pearson) of a dataset. 
- - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher's definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators - - Use `kurtosistest` to see if result is close enough to normal. - - Parameters - ---------- - a : array - data for which the kurtosis is calculated - axis : int or None - Axis along which the kurtosis is calculated - fisher : bool - If True, Fisher's definition is used (normal ==> 0.0). If False, - Pearson's definition is used (normal ==> 3.0). - bias : bool - If False, then the calculations are corrected for statistical bias. - - Returns - ------- - kurtosis : array - The kurtosis of values along an axis. If all values are equal, - return -3 for Fisher's definition and 0 for Pearson's definition. - - References - ---------- - .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard - Probability and Statistics Tables and Formulae. Chapman & Hall: New - York. 2000. - - """ - a, axis = _chk_asarray(a, axis) - n = a.shape[axis] - m2 = moment(a,2,axis) - m4 = moment(a,4,axis) - zero = (m2 == 0) - olderr = np.seterr(all='ignore') - try: - vals = np.where(zero, 0, m4 / m2**2.0) - finally: - np.seterr(**olderr) - - if not bias: - can_correct = (n > 3) & (m2 > 0) - if can_correct.any(): - m2 = np.extract(can_correct, m2) - m4 = np.extract(can_correct, m4) - nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0) - np.place(vals, can_correct, nval+3.0) - - if vals.ndim == 0: - vals = vals.item() # array scalar - - if fisher: - return vals - 3 - else: - return vals - - -def describe(a, axis=0, ddof=1): - """ - Computes several descriptive statistics of the passed array. - - Parameters - ---------- - a : array_like - Input data. - axis : int, optional - Axis along which statistics are calculated. If axis is None, then data - array is raveled. The default axis is zero. - ddof : int, optional - Delta degrees of freedom. Default is 1. - - Returns - ------- - size of the data : int - length of data along axis - (min, max): tuple of ndarrays or floats - minimum and maximum value of data array - arithmetic mean : ndarray or float - mean of data along axis - unbiased variance : ndarray or float - variance of the data along axis, denominator is number of observations - minus one. - biased skewness : ndarray or float - skewness, based on moment calculations with denominator equal to the - number of observations, i.e. no degrees of freedom correction - biased kurtosis : ndarray or float - kurtosis (Fisher), the kurtosis is normalized so that it is zero for the - normal distribution. No degrees of freedom or bias correction is used. - - See Also - -------- - skew, kurtosis - - """ - a, axis = _chk_asarray(a, axis) - n = a.shape[axis] - mm = (np.min(a, axis=axis), np.max(a, axis=axis)) - m = np.mean(a, axis=axis) - v = np.var(a, axis=axis, ddof=ddof) - sk = skew(a, axis) - kurt = kurtosis(a, axis) - return n, mm, m, v, sk, kurt - -##################################### -######## NORMALITY TESTS ########## -##################################### - - -def skewtest(a, axis=0): - """ - Tests whether the skew is different from the normal distribution. - - This function tests the null hypothesis that the skewness of - the population that the sample was drawn from is the same - as that of a corresponding normal distribution. 
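# Editor's sketch (not part of the original patch): kurtosis above computes
# m4 / m2**2 (Pearson); Fisher's convention subtracts 3 so a normal sample
# sits near 0:
import numpy as np

def kurtosis_sketch(x, fisher=True):
    d = np.asarray(x, dtype=float)
    d = d - d.mean()
    m2, m4 = (d ** 2).mean(), (d ** 4).mean()
    pearson = m4 / m2 ** 2
    return pearson - 3.0 if fisher else pearson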
- - Parameters - ---------- - a : array - axis : int or None - - Returns - ------- - z-score : float - The computed z-score for this test. - p-value : float - a 2-sided p-value for the hypothesis test - - Notes - ----- - The sample size must be at least 8. - - """ - a, axis = _chk_asarray(a, axis) - if axis is None: - a = np.ravel(a) - axis = 0 - b2 = skew(a, axis) - n = float(a.shape[axis]) - if n < 8: - raise ValueError( - "skewtest is not valid with less than 8 samples; %i samples" - " were given." % int(n)) - y = b2 * math.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2))) - beta2 = (3.0 * (n * n + 27 * n - 70) * (n + 1) * (n + 3) / - ((n - 2.0) * (n + 5) * (n + 7) * (n + 9))) - W2 = -1 + math.sqrt(2 * (beta2 - 1)) - delta = 1 / math.sqrt(0.5 * math.log(W2)) - alpha = math.sqrt(2.0 / (W2 - 1)) - y = np.where(y == 0, 1, y) - Z = delta * np.log(y / alpha + np.sqrt((y / alpha) ** 2 + 1)) - return Z, 2 * distributions.norm.sf(np.abs(Z)) - - -def kurtosistest(a, axis=0): - """ - Tests whether a dataset has normal kurtosis - - This function tests the null hypothesis that the kurtosis - of the population from which the sample was drawn is that - of the normal distribution: ``kurtosis = 3(n-1)/(n+1)``. - - Parameters - ---------- - a : array - array of the sample data - axis : int or None - the axis to operate along, or None to work on the whole array. - The default is the first axis. - - Returns - ------- - z-score : float - The computed z-score for this test. - p-value : float - The 2-sided p-value for the hypothesis test - - Notes - ----- - Valid only for n>20. The Z-score is set to 0 for bad entries. - - """ - a, axis = _chk_asarray(a, axis) - n = float(a.shape[axis]) - if n < 5: - raise ValueError( - "kurtosistest requires at least 5 observations; %i observations" - " were given." % int(n)) - if n < 20: - warnings.warn( - "kurtosistest only valid for n>=20 ... continuing anyway, n=%i" % - int(n)) - b2 = kurtosis(a, axis, fisher=False) - E = 3.0*(n-1) / (n+1) - varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5)) - x = (b2-E)/np.sqrt(varb2) - sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) / - (n*(n-2)*(n-3))) - A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2))) - term1 = 1 - 2/(9.0*A) - denom = 1 + x*np.sqrt(2/(A-4.0)) - denom = np.where(denom < 0, 99, denom) - term2 = np.where(denom < 0, term1, np.power((1-2.0/A)/denom,1/3.0)) - Z = (term1 - term2) / np.sqrt(2/(9.0*A)) - Z = np.where(denom == 99, 0, Z) - if Z.ndim == 0: - Z = Z[()] - # JPNote: p-value sometimes larger than 1 - # zprob uses upper tail, so Z needs to be positive - return Z, 2 * distributions.norm.sf(np.abs(Z)) - - -def normaltest(a, axis=0): - """ - Tests whether a sample differs from a normal distribution. - - This function tests the null hypothesis that a sample comes - from a normal distribution. It is based on D'Agostino and - Pearson's [1]_, [2]_ test that combines skew and kurtosis to - produce an omnibus test of normality. - - - Parameters - ---------- - a : array_like - The array containing the data to be tested. - axis : int or None - If None, the array is treated as a single data set, regardless of - its shape. Otherwise, each 1-d array along axis `axis` is tested. - - Returns - ------- - k2 : float or array - `s^2 + k^2`, where `s` is the z-score returned by `skewtest` and - `k` is the z-score returned by `kurtosistest`. - p-value : float or array - A 2-sided chi squared probability for the hypothesis test. - - References - ---------- - .. [1] D'Agostino, R. B. 
(1971), "An omnibus test of normality for - moderate and large sample size," Biometrika, 58, 341-348 - - .. [2] D'Agostino, R. and Pearson, E. S. (1973), "Testing for - departures from normality," Biometrika, 60, 613-622 - - """ - a, axis = _chk_asarray(a, axis) - s, _ = skewtest(a, axis) - k, _ = kurtosistest(a, axis) - k2 = s*s + k*k - return k2, chisqprob(k2,2) - - -def jarque_bera(x): - """ - Perform the Jarque-Bera goodness of fit test on sample data. - - The Jarque-Bera test tests whether the sample data has the skewness and - kurtosis matching a normal distribution. - - Note that this test only works for a large enough number of data samples - (>2000) as the test statistic asymptotically has a Chi-squared distribution - with 2 degrees of freedom. - - Parameters - ---------- - x : array_like - Observations of a random variable. - - Returns - ------- - jb_value : float - The test statistic. - p : float - The p-value for the hypothesis test. - - References - ---------- - .. [1] Jarque, C. and Bera, A. (1980) "Efficient tests for normality, - homoscedasticity and serial independence of regression residuals", - 6 Econometric Letters 255-259. - - Examples - -------- - >>> from scipy import stats - >>> np.random.seed(987654321) - >>> x = np.random.normal(0, 1, 100000) - >>> y = np.random.rayleigh(1, 100000) - >>> stats.jarque_bera(x) - (4.7165707989581342, 0.09458225503041906) - >>> stats.jarque_bera(y) - (6713.7098548143422, 0.0) - - """ - x = np.asarray(x) - n = float(x.size) - if n == 0: - raise ValueError('At least one observation is required.') - - mu = x.mean() - diffx = x - mu - skewness = (1 / n * np.sum(diffx**3)) / (1 / n * np.sum(diffx**2))**(3 / 2.) - kurtosis = (1 / n * np.sum(diffx**4)) / (1 / n * np.sum(diffx**2))**2 - jb_value = n / 6 * (skewness**2 + (kurtosis - 3)**2 / 4) - p = 1 - distributions.chi2.cdf(jb_value, 2) - - return jb_value, p - - -##################################### -###### FREQUENCY FUNCTIONS ####### -##################################### - -def itemfreq(a): - """ - Returns a 2-D array of item frequencies. - - Parameters - ---------- - a : (N,) array_like - Input array. - - Returns - ------- - itemfreq : (K, 2) ndarray - A 2-D frequency table. Column 1 contains sorted, unique values from - `a`, column 2 contains their respective counts. - - Examples - -------- - >>> a = np.array([1, 1, 5, 0, 1, 2, 2, 0, 1, 4]) - >>> stats.itemfreq(a) - array([[ 0., 2.], - [ 1., 4.], - [ 2., 2.], - [ 4., 1.], - [ 5., 1.]]) - >>> np.bincount(a) - array([2, 4, 2, 0, 1, 1]) - - >>> stats.itemfreq(a/10.) - array([[ 0. , 2. ], - [ 0.1, 4. ], - [ 0.2, 2. ], - [ 0.4, 1. ], - [ 0.5, 1. ]]) - - """ - items, inv = np.unique(a, return_inverse=True) - freq = np.bincount(inv) - return np.array([items, freq]).T - - -def scoreatpercentile(a, per, limit=(), interpolation_method='fraction', - axis=None): - """ - Calculate the score at a given percentile of the input sequence. - - For example, the score at `per=50` is the median. If the desired quantile - lies between two data points, we interpolate between them, according to - the value of `interpolation`. If the parameter `limit` is provided, it - should be a tuple (lower, upper) of two values. - - Parameters - ---------- - a : array_like - A 1-D array of values from which to extract score. - per : array_like - Percentile(s) at which to extract score. Values should be in range - [0,100]. - limit : tuple, optional - Tuple of two scalars, the lower and upper limits within which to - compute the percentile. 
Values of `a` outside - this (closed) interval will be ignored. - interpolation : {'fraction', 'lower', 'higher'}, optional - This optional parameter specifies the interpolation method to use, - when the desired quantile lies between two data points `i` and `j` - - - fraction: ``i + (j - i) * fraction`` where ``fraction`` is the - fractional part of the index surrounded by ``i`` and ``j``. - - lower: ``i``. - - higher: ``j``. - - axis : int, optional - Axis along which the percentiles are computed. The default (None) - is to compute the median along a flattened version of the array. - - Returns - ------- - score : float or ndarray - Score at percentile(s). - - See Also - -------- - percentileofscore, numpy.percentile - - Notes - ----- - This function will become obsolete in the future. - For Numpy 1.9 and higher, `numpy.percentile` provides all the functionality - that `scoreatpercentile` provides. And it's significantly faster. - Therefore it's recommended to use `numpy.percentile` for users that have - numpy >= 1.9. - - Examples - -------- - >>> from scipy import stats - >>> a = np.arange(100) - >>> stats.scoreatpercentile(a, 50) - 49.5 - - """ - # adapted from NumPy's percentile function. When we require numpy >= 1.8, - # the implementation of this function can be replaced by np.percentile. - a = np.asarray(a) - if a.size == 0: - # empty array, return nan(s) with shape matching `per` - if np.isscalar(per): - return np.nan - else: - return np.ones(np.asarray(per).shape, dtype=np.float64) * np.nan - - if limit: - a = a[(limit[0] <= a) & (a <= limit[1])] - - sorted = np.sort(a, axis=axis) - if axis is None: - axis = 0 - - return _compute_qth_percentile(sorted, per, interpolation_method, axis) - - -# handle sequence of per's without calling sort multiple times -def _compute_qth_percentile(sorted, per, interpolation_method, axis): - if not np.isscalar(per): - score = [_compute_qth_percentile(sorted, i, interpolation_method, axis) - for i in per] - return np.array(score) - - if (per < 0) or (per > 100): - raise ValueError("percentile must be in the range [0, 100]") - - indexer = [slice(None)] * sorted.ndim - idx = per / 100. * (sorted.shape[axis] - 1) - - if int(idx) != idx: - # round fractional indices according to interpolation method - if interpolation_method == 'lower': - idx = int(np.floor(idx)) - elif interpolation_method == 'higher': - idx = int(np.ceil(idx)) - elif interpolation_method == 'fraction': - pass # keep idx as fraction and interpolate - else: - raise ValueError("interpolation_method can only be 'fraction', " - "'lower' or 'higher'") - - i = int(idx) - if i == idx: - indexer[axis] = slice(i, i + 1) - weights = array(1) - sumval = 1.0 - else: - indexer[axis] = slice(i, i + 2) - j = i + 1 - weights = array([(j - idx), (idx - i)], float) - wshape = [1] * sorted.ndim - wshape[axis] = 2 - weights.shape = wshape - sumval = weights.sum() - - # Use np.add.reduce (== np.sum but a little faster) to coerce data type - return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval - - -def percentileofscore(a, score, kind='rank'): - """ - The percentile rank of a score relative to a list of scores. - - A `percentileofscore` of, for example, 80% means that 80% of the - scores in `a` are below the given score. In the case of gaps or - ties, the exact definition depends on the optional keyword, `kind`. - - Parameters - ---------- - a : array_like - Array of scores to which `score` is compared. - score : int or float - Score that is compared to the elements in `a`. 
-    kind : {'rank', 'weak', 'strict', 'mean'}, optional
-        This optional parameter specifies the interpretation of the
-        resulting score:
-
-        - "rank": Average percentage ranking of score.  In case of
-                  multiple matches, average the percentage rankings of
-                  all matching scores.
-        - "weak": This kind corresponds to the definition of a cumulative
-                  distribution function.  A percentileofscore of 80%
-                  means that 80% of values are less than or equal
-                  to the provided score.
-        - "strict": Similar to "weak", except that only values that are
-                    strictly less than the given score are counted.
-        - "mean": The average of the "weak" and "strict" scores, often used
-                  in testing.  See
-
-                  http://en.wikipedia.org/wiki/Percentile_rank
-
-    Returns
-    -------
-    pcos : float
-        Percentile-position of score (0-100) relative to `a`.
-
-    Examples
-    --------
-    Three-quarters of the given values lie below a given score:
-
-    >>> percentileofscore([1, 2, 3, 4], 3)
-    75.0
-
-    With multiple matches, note how the scores of the two matches, 0.6
-    and 0.8 respectively, are averaged:
-
-    >>> percentileofscore([1, 2, 3, 3, 4], 3)
-    70.0
-
-    Only 2/5 values are strictly less than 3:
-
-    >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='strict')
-    40.0
-
-    But 4/5 values are less than or equal to 3:
-
-    >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='weak')
-    80.0
-
-    The average between the weak and the strict scores is:
-
-    >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='mean')
-    60.0
-
-    """
-    a = np.array(a)
-    n = len(a)
-
-    if kind == 'rank':
-        if not np.any(a == score):
-            a = np.append(a, score)
-            a_len = np.array(list(range(len(a))))
-        else:
-            a_len = np.array(list(range(len(a)))) + 1.0
-
-        a = np.sort(a)
-        idx = [a == score]
-        pct = (np.mean(a_len[idx]) / n) * 100.0
-        return pct
-
-    elif kind == 'strict':
-        return sum(a < score) / float(n) * 100
-    elif kind == 'weak':
-        return sum(a <= score) / float(n) * 100
-    elif kind == 'mean':
-        return (sum(a < score) + sum(a <= score)) * 50 / float(n)
-    else:
-        raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'")
-
-
-def histogram2(a, bins):
-    """
-    Compute histogram using divisions in bins.
-
-    Count the number of times values from array `a` fall into
-    numerical ranges defined by `bins`.  Range x is given by
-    bins[x] <= range_x < bins[x+1] where x = 0, ..., N and N is the
-    length of the `bins` array.  The last range is given by
-    bins[N] <= range_N < infinity.  Values less than bins[0] are
-    not included in the histogram.
-
-    Parameters
-    ----------
-    a : array_like of rank 1
-        The array of values to be assigned into bins
-    bins : array_like of rank 1
-        Defines the ranges of values to use during histogramming.
-
-    Returns
-    -------
-    histogram2 : ndarray of rank 1
-        Each value represents the occurrences for a given bin (range) of
-        values.
-
-    """
-    # comment: probably obsoleted by numpy.histogram()
-    n = np.searchsorted(np.sort(a), bins)
-    n = np.concatenate([n, [len(a)]])
-    return n[1:] - n[:-1]
-
-
-def histogram(a, numbins=10, defaultlimits=None, weights=None, printextras=False):
-    """
-    Separates the range into several bins and returns the number of instances
-    in each bin.
-
-    Parameters
-    ----------
-    a : array_like
-        Array of scores which will be put into bins.
-    numbins : int, optional
-        The number of bins to use for the histogram.  Default is 10.
-    defaultlimits : tuple (lower, upper), optional
-        The lower and upper values for the range of the histogram.
-        If no value is given, a range slightly larger than the range of the
-        values in a is used.
-        Specifically ``(a.min() - s, a.max() + s)``,
-        where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
-    weights : array_like, optional
-        The weights for each value in `a`.  Default is None, which gives each
-        value a weight of 1.0
-    printextras : bool, optional
-        If True, if there are extra points (i.e. the points that fall outside
-        the bin limits) a warning is raised saying how many of those points
-        there are.  Default is False.
-
-    Returns
-    -------
-    histogram : ndarray
-        Number of points (or sum of weights) in each bin.
-    low_range : float
-        Lowest value of histogram, the lower limit of the first bin.
-    binsize : float
-        The size of the bins (all bins have the same size).
-    extrapoints : int
-        The number of points outside the range of the histogram.
-
-    See Also
-    --------
-    numpy.histogram
-
-    Notes
-    -----
-    This histogram is based on numpy's histogram but has a larger range by
-    default if `defaultlimits` is not set.
-
-    """
-    a = np.ravel(a)
-    if defaultlimits is None:
-        # no range given, so use values in `a`
-        data_min = a.min()
-        data_max = a.max()
-        # Have bins extend past min and max values slightly
-        s = (data_max - data_min) / (2. * (numbins - 1.))
-        defaultlimits = (data_min - s, data_max + s)
-    # use numpy's histogram method to compute bins
-    hist, bin_edges = np.histogram(a, bins=numbins, range=defaultlimits,
-                                   weights=weights)
-    # hist are not always floats, convert to keep with old output
-    hist = np.array(hist, dtype=float)
-    # fixed width for bins is assumed, as numpy's histogram gives
-    # fixed width bins for int values for 'bins'
-    binsize = bin_edges[1] - bin_edges[0]
-    # calculate number of extra points
-    extrapoints = len([v for v in a
-                       if defaultlimits[0] > v or v > defaultlimits[1]])
-    if extrapoints > 0 and printextras:
-        warnings.warn("Points outside given histogram range = %s"
-                      % extrapoints)
-    return (hist, defaultlimits[0], binsize, extrapoints)
-
-
-def cumfreq(a, numbins=10, defaultreallimits=None, weights=None):
-    """
-    Returns a cumulative frequency histogram, using the histogram function.
-
-    Parameters
-    ----------
-    a : array_like
-        Input array.
-    numbins : int, optional
-        The number of bins to use for the histogram.  Default is 10.
-    defaultreallimits : tuple (lower, upper), optional
-        The lower and upper values for the range of the histogram.
-        If no value is given, a range slightly larger than the range of the
-        values in `a` is used.  Specifically ``(a.min() - s, a.max() + s)``,
-        where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
-    weights : array_like, optional
-        The weights for each value in `a`.  Default is None, which gives each
-        value a weight of 1.0
-
-    Returns
-    -------
-    cumfreq : ndarray
-        Binned values of cumulative frequency.
-    lowerreallimit : float
-        Lower real limit
-    binsize : float
-        Width of each bin.
-    extrapoints : int
-        Extra points.
-
-    Examples
-    --------
-    >>> import scipy.stats as stats
-    >>> x = [1, 4, 2, 1, 3, 1]
-    >>> cumfreqs, lowlim, binsize, extrapoints = stats.cumfreq(x, numbins=4)
-    >>> cumfreqs
-    array([ 3.,  4.,  5.,  6.])
-    >>> cumfreqs, lowlim, binsize, extrapoints = \
-    ...     stats.cumfreq(x, numbins=4, defaultreallimits=(1.5, 5))
-    >>> cumfreqs
-    array([ 1.,  2.,  3.,  3.])
-    >>> extrapoints
-    3
-
-    """
-    h, l, b, e = histogram(a, numbins, defaultreallimits, weights=weights)
-    cumhist = np.cumsum(h * 1, axis=0)
-    return cumhist, l, b, e
-
-
-def relfreq(a, numbins=10, defaultreallimits=None, weights=None):
-    """
-    Returns a relative frequency histogram, using the histogram function.
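The cumulative and relative variants are thin wrappers over the binned counts, as the wrappers here show.  A sketch of the same relationship using `np.histogram` in place of the extended-range `histogram` above (so the bin limits differ slightly):

    import numpy as np

    x = [1, 4, 2, 1, 3, 1]
    counts, edges = np.histogram(x, bins=4)   # per-bin counts
    cumcounts = np.cumsum(counts)             # what cumfreq returns
    relfreqs = counts / float(len(x))         # what relfreq returns
    assert cumcounts[-1] == len(x) and np.isclose(relfreqs.sum(), 1.0)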
-
-    Parameters
-    ----------
-    a : array_like
-        Input array.
-    numbins : int, optional
-        The number of bins to use for the histogram.  Default is 10.
-    defaultreallimits : tuple (lower, upper), optional
-        The lower and upper values for the range of the histogram.
-        If no value is given, a range slightly larger than the range of the
-        values in `a` is used.  Specifically ``(a.min() - s, a.max() + s)``,
-        where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
-    weights : array_like, optional
-        The weights for each value in `a`.  Default is None, which gives each
-        value a weight of 1.0
-
-    Returns
-    -------
-    relfreq : ndarray
-        Binned values of relative frequency.
-    lowerreallimit : float
-        Lower real limit
-    binsize : float
-        Width of each bin.
-    extrapoints : int
-        Extra points.
-
-    Examples
-    --------
-    >>> import scipy.stats as stats
-    >>> a = np.array([1, 4, 2, 1, 3, 1])
-    >>> relfreqs, lowlim, binsize, extrapoints = stats.relfreq(a, numbins=4)
-    >>> relfreqs
-    array([ 0.5       ,  0.16666667,  0.16666667,  0.16666667])
-    >>> np.sum(relfreqs)  # relative frequencies should add up to 1
-    0.99999999999999989
-
-    """
-    h, l, b, e = histogram(a, numbins, defaultreallimits, weights=weights)
-    h = np.array(h / float(np.array(a).shape[0]))
-    return h, l, b, e
-
-
-#####################################
-######  VARIABILITY FUNCTIONS  #####
-#####################################
-
-def obrientransform(*args):
-    """
-    Computes the O'Brien transform on input data (any number of arrays).
-
-    Used to test for homogeneity of variance prior to running one-way stats.
-    Each array in ``*args`` is one level of a factor.
-    If `f_oneway` is run on the transformed data and found significant,
-    the variances are unequal.  From Maxwell and Delaney [1]_, p.112.
-
-    Parameters
-    ----------
-    args : tuple of array_like
-        Any number of arrays.
-
-    Returns
-    -------
-    obrientransform : ndarray
-        Transformed data for use in an ANOVA.  The first dimension
-        of the result corresponds to the sequence of transformed
-        arrays.  If the arrays given are all 1-D of the same length,
-        the return value is a 2-D array; otherwise it is a 1-D array
-        of type object, with each element being an ndarray.
-
-    References
-    ----------
-    .. [1] S. E. Maxwell and H. D. Delaney, "Designing Experiments and
-           Analyzing Data: A Model Comparison Perspective", Wadsworth, 1990.
-
-    Examples
-    --------
-    We'll test the following data sets for differences in their variance.
-
-    >>> x = [10, 11, 13, 9, 7, 12, 12, 9, 10]
-    >>> y = [13, 21, 5, 10, 8, 14, 10, 12, 7, 15]
-
-    Apply the O'Brien transform to the data.
-
-    >>> tx, ty = obrientransform(x, y)
-
-    Use `scipy.stats.f_oneway` to apply a one-way ANOVA test to the
-    transformed data.
-
-    >>> from scipy.stats import f_oneway
-    >>> F, p = f_oneway(tx, ty)
-    >>> p
-    0.1314139477040335
-
-    If we require that ``p < 0.05`` for significance, we cannot conclude
-    that the variances are different.
-    """
-    TINY = np.sqrt(np.finfo(float).eps)
-
-    # `arrays` will hold the transformed arguments.
-    arrays = []
-
-    for arg in args:
-        a = np.asarray(arg)
-        n = len(a)
-        mu = np.mean(a)
-        sq = (a - mu)**2
-        sumsq = sq.sum()
-
-        # The O'Brien transform.
-        t = ((n - 1.5) * n * sq - 0.5 * sumsq) / ((n - 1) * (n - 2))
-
-        # Check that the mean of the transformed data is equal to the
-        # original variance.
-        var = sumsq / (n - 1)
-        if abs(var - np.mean(t)) > TINY:
-            raise ValueError('Lack of convergence in obrientransform.')
-
-        arrays.append(t)
-
-    # If the arrays are not all the same shape, calling np.array(arrays)
-    # creates a 1-D array with dtype `object` in numpy 1.6+.  In numpy
-    # 1.5.x, it raises an exception.  To work around this, we explicitly
-    # set the dtype to `object` when the arrays are not all the same shape.
-    if len(arrays) < 2 or all(x.shape == arrays[0].shape for x in arrays[1:]):
-        dt = None
-    else:
-        dt = object
-    return np.array(arrays, dtype=dt)
-
-
-def signaltonoise(a, axis=0, ddof=0):
-    """
-    The signal-to-noise ratio of the input data.
-
-    Returns the signal-to-noise ratio of `a`, here defined as the mean
-    divided by the standard deviation.
-
-    Parameters
-    ----------
-    a : array_like
-        An array_like object containing the sample data.
-    axis : int or None, optional
-        If axis is equal to None, the array is first ravel'd.  If axis is an
-        integer, this is the axis over which to operate.  Default is 0.
-    ddof : int, optional
-        Degrees of freedom correction for standard deviation.  Default is 0.
-
-    Returns
-    -------
-    s2n : ndarray
-        The mean to standard deviation ratio(s) along `axis`, or 0 where the
-        standard deviation is 0.
-
-    """
-    a = np.asanyarray(a)
-    m = a.mean(axis)
-    sd = a.std(axis=axis, ddof=ddof)
-    return np.where(sd == 0, 0, m / sd)
-
-
-def sem(a, axis=0, ddof=1):
-    """
-    Calculates the standard error of the mean (or standard error of
-    measurement) of the values in the input array.
-
-    Parameters
-    ----------
-    a : array_like
-        An array containing the values for which the standard error is
-        returned.
-    axis : int or None, optional
-        If axis is None, ravel `a` first.  If axis is an integer, this will
-        be the axis over which to operate.  Defaults to 0.
-    ddof : int, optional
-        Delta degrees-of-freedom.  How many degrees of freedom to adjust
-        for bias in limited samples relative to the population estimate
-        of variance.  Defaults to 1.
-
-    Returns
-    -------
-    s : ndarray or float
-        The standard error of the mean in the sample(s), along the input axis.
-
-    Notes
-    -----
-    The default value for `ddof` is different from the default (0) used by
-    other ddof-containing routines, such as np.std and stats.nanstd.
-
-    Examples
-    --------
-    Find standard error along the first axis:
-
-    >>> from scipy import stats
-    >>> a = np.arange(20).reshape(5,4)
-    >>> stats.sem(a)
-    array([ 2.8284,  2.8284,  2.8284,  2.8284])
-
-    Find standard error across the whole array, using n degrees of freedom:
-
-    >>> stats.sem(a, axis=None, ddof=0)
-    1.2893796958227628
-
-    """
-    a, axis = _chk_asarray(a, axis)
-    n = a.shape[axis]
-    s = np.std(a, axis=axis, ddof=ddof) / np.sqrt(n)
-    return s
-
-
-def zscore(a, axis=0, ddof=0):
-    """
-    Calculates the z score of each value in the sample, relative to the
-    sample mean and standard deviation.
-
-    Parameters
-    ----------
-    a : array_like
-        An array like object containing the sample data.
-    axis : int or None, optional
-        If `axis` is equal to None, the array is first raveled.  If `axis` is
-        an integer, this is the axis over which to operate.  Default is 0.
-    ddof : int, optional
-        Degrees of freedom correction in the calculation of the
-        standard deviation.  Default is 0.
-
-    Returns
-    -------
-    zscore : array_like
-        The z-scores, standardized by mean and standard deviation of input
-        array `a`.
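`sem` above is just the ``ddof=1`` standard deviation scaled by ``sqrt(n)``; a quick numpy check of that identity, assuming `scipy.stats.sem` is available:

    import numpy as np
    from scipy import stats

    a = np.arange(20).reshape(5, 4)
    n = a.shape[0]
    manual = np.std(a, axis=0, ddof=1) / np.sqrt(n)
    assert np.allclose(manual, stats.sem(a))   # ddof=1 is sem's default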
- - Notes - ----- - This function preserves ndarray subclasses, and works also with - matrices and masked arrays (it uses `asanyarray` instead of `asarray` - for parameters). - - Examples - -------- - >>> a = np.array([ 0.7972, 0.0767, 0.4383, 0.7866, 0.8091, 0.1954, - 0.6307, 0.6599, 0.1065, 0.0508]) - >>> from scipy import stats - >>> stats.zscore(a) - array([ 1.1273, -1.247 , -0.0552, 1.0923, 1.1664, -0.8559, 0.5786, - 0.6748, -1.1488, -1.3324]) - - Computing along a specified axis, using n-1 degrees of freedom (``ddof=1``) - to calculate the standard deviation: - - >>> b = np.array([[ 0.3148, 0.0478, 0.6243, 0.4608], - [ 0.7149, 0.0775, 0.6072, 0.9656], - [ 0.6341, 0.1403, 0.9759, 0.4064], - [ 0.5918, 0.6948, 0.904 , 0.3721], - [ 0.0921, 0.2481, 0.1188, 0.1366]]) - >>> stats.zscore(b, axis=1, ddof=1) - array([[-0.19264823, -1.28415119, 1.07259584, 0.40420358], - [ 0.33048416, -1.37380874, 0.04251374, 1.00081084], - [ 0.26796377, -1.12598418, 1.23283094, -0.37481053], - [-0.22095197, 0.24468594, 1.19042819, -1.21416216], - [-0.82780366, 1.4457416 , -0.43867764, -0.1792603 ]]) - """ - a = np.asanyarray(a) - mns = a.mean(axis=axis) - sstd = a.std(axis=axis, ddof=ddof) - if axis and mns.ndim < a.ndim: - return ((a - np.expand_dims(mns, axis=axis)) / - np.expand_dims(sstd,axis=axis)) - else: - return (a - mns) / sstd - - -def zmap(scores, compare, axis=0, ddof=0): - """ - Calculates the relative z-scores. - - Returns an array of z-scores, i.e., scores that are standardized to zero - mean and unit variance, where mean and variance are calculated from the - comparison array. - - Parameters - ---------- - scores : array_like - The input for which z-scores are calculated. - compare : array_like - The input from which the mean and standard deviation of the - normalization are taken; assumed to have the same dimension as - `scores`. - axis : int or None, optional - Axis over which mean and variance of `compare` are calculated. - Default is 0. - ddof : int, optional - Degrees of freedom correction in the calculation of the - standard deviation. Default is 0. - - Returns - ------- - zscore : array_like - Z-scores, in the same shape as `scores`. - - Notes - ----- - This function preserves ndarray subclasses, and works also with - matrices and masked arrays (it uses `asanyarray` instead of `asarray` - for parameters). - - Examples - -------- - >>> a = [0.5, 2.0, 2.5, 3] - >>> b = [0, 1, 2, 3, 4] - >>> zmap(a, b) - array([-1.06066017, 0. , 0.35355339, 0.70710678]) - """ - scores, compare = map(np.asanyarray, [scores, compare]) - mns = compare.mean(axis=axis) - sstd = compare.std(axis=axis, ddof=ddof) - if axis and mns.ndim < compare.ndim: - return ((scores - np.expand_dims(mns, axis=axis)) / - np.expand_dims(sstd,axis=axis)) - else: - return (scores - mns) / sstd - - -##################################### -####### TRIMMING FUNCTIONS ####### -##################################### - -def threshold(a, threshmin=None, threshmax=None, newval=0): - """ - Clip array to a given value. - - Similar to numpy.clip(), except that values less than `threshmin` or - greater than `threshmax` are replaced by `newval`, instead of by - `threshmin` and `threshmax` respectively. - - Parameters - ---------- - a : array_like - Data to threshold. - threshmin : float, int or None, optional - Minimum threshold, defaults to None. - threshmax : float, int or None, optional - Maximum threshold, defaults to None. - newval : float or int, optional - Value to put in place of values in `a` outside of bounds. - Defaults to 0. 
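`zscore` and `zmap` above share one core step; `zmap` simply takes the mean and standard deviation from a second array.  A minimal sketch of that core, ignoring the axis/`expand_dims` handling:

    import numpy as np

    compare = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
    scores = np.array([0.5, 2.0, 2.5, 3.0])

    # Standardize `scores` by the mean/std of `compare` (ddof=0).
    z = (scores - compare.mean()) / compare.std()
    # matches the zmap docstring example: [-1.0607, 0., 0.3536, 0.7071]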
-
-    Returns
-    -------
-    out : ndarray
-        The clipped input array, with values less than `threshmin` or
-        greater than `threshmax` replaced with `newval`.
-
-    Examples
-    --------
-    >>> a = np.array([9, 9, 6, 3, 1, 6, 1, 0, 0, 8])
-    >>> from scipy import stats
-    >>> stats.threshold(a, threshmin=2, threshmax=8, newval=-1)
-    array([-1, -1,  6,  3, -1,  6, -1, -1, -1,  8])
-
-    """
-    a = asarray(a).copy()
-    mask = zeros(a.shape, dtype=bool)
-    if threshmin is not None:
-        mask |= (a < threshmin)
-    if threshmax is not None:
-        mask |= (a > threshmax)
-    a[mask] = newval
-    return a
-
-
-def sigmaclip(a, low=4., high=4.):
-    """
-    Iterative sigma-clipping of array elements.
-
-    The output array contains only those elements of the input array `c`
-    that satisfy the conditions ::
-
-        mean(c) - std(c)*low < c < mean(c) + std(c)*high
-
-    Starting from the full sample, all elements outside the critical range
-    are removed.  The iteration continues with a new critical range until no
-    elements are outside the range.
-
-    Parameters
-    ----------
-    a : array_like
-        Data array, will be raveled if not 1-D.
-    low : float, optional
-        Lower bound factor of sigma clipping.  Default is 4.
-    high : float, optional
-        Upper bound factor of sigma clipping.  Default is 4.
-
-    Returns
-    -------
-    c : ndarray
-        Input array with clipped elements removed.
-    critlower : float
-        Lower threshold value used for clipping.
-    critupper : float
-        Upper threshold value used for clipping.
-
-    Examples
-    --------
-    >>> a = np.concatenate((np.linspace(9.5,10.5,31), np.linspace(0,20,5)))
-    >>> fact = 1.5
-    >>> c, low, upp = sigmaclip(a, fact, fact)
-    >>> c
-    array([  9.96666667,  10.        ,  10.03333333,  10.        ])
-    >>> c.var(), c.std()
-    (0.00055555555555555165, 0.023570226039551501)
-    >>> low, c.mean() - fact*c.std(), c.min()
-    (9.9646446609406727, 9.9646446609406727, 9.9666666666666668)
-    >>> upp, c.mean() + fact*c.std(), c.max()
-    (10.035355339059327, 10.035355339059327, 10.033333333333333)
-
-    >>> a = np.concatenate((np.linspace(9.5,10.5,11),
-    ...                     np.linspace(-100,-50,3)))
-    >>> c, low, upp = sigmaclip(a, 1.8, 1.8)
-    >>> (c == np.linspace(9.5,10.5,11)).all()
-    True
-
-    """
-    c = np.asarray(a).ravel()
-    delta = 1
-    while delta:
-        c_std = c.std()
-        c_mean = c.mean()
-        size = c.size
-        critlower = c_mean - c_std*low
-        critupper = c_mean + c_std*high
-        c = c[(c > critlower) & (c < critupper)]
-        delta = size - c.size
-    return c, critlower, critupper
-
-
-def trimboth(a, proportiontocut, axis=0):
-    """
-    Slices off a proportion of items from both ends of an array.
-
-    Slices off the passed proportion of items from both ends of the passed
-    array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and**
-    rightmost 10% of scores).  You must pre-sort the array if you want
-    'proper' trimming.  Slices off less if proportion results in a
-    non-integer slice index (i.e., conservatively slices off
-    `proportiontocut`).
-
-    Parameters
-    ----------
-    a : array_like
-        Data to trim.
-    proportiontocut : float
-        Proportion (in range 0-1) of total data set to trim of each end.
-    axis : int or None, optional
-        Axis along which the observations are trimmed.  The default is to
-        trim along axis=0.  If axis is None then the array will be flattened
-        before trimming.
-
-    Returns
-    -------
-    out : ndarray
-        Trimmed version of array `a`.
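The clipping loop above terminates when an iteration removes nothing, so the returned bounds bracket every surviving element.  A short check, assuming `scipy.stats.sigmaclip`:

    import numpy as np
    from scipy import stats

    a = np.concatenate((np.linspace(9.5, 10.5, 31), np.linspace(0, 20, 5)))
    clipped, lower, upper = stats.sigmaclip(a, low=1.5, high=1.5)

    # Every kept value lies strictly inside the final critical range.
    assert clipped.min() > lower and clipped.max() < upper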
- - See Also - -------- - trim_mean - - Examples - -------- - >>> from scipy import stats - >>> a = np.arange(20) - >>> b = stats.trimboth(a, 0.1) - >>> b.shape - (16,) - - """ - a = np.asarray(a) - if axis is None: - a = a.ravel() - axis = 0 - - nobs = a.shape[axis] - lowercut = int(proportiontocut * nobs) - uppercut = nobs - lowercut - if (lowercut >= uppercut): - raise ValueError("Proportion too big.") - - sl = [slice(None)] * a.ndim - sl[axis] = slice(lowercut, uppercut) - return a[sl] - - -def trim1(a, proportiontocut, tail='right'): - """ - Slices off a proportion of items from ONE end of the passed array - distribution. - - If `proportiontocut` = 0.1, slices off 'leftmost' or 'rightmost' - 10% of scores. Slices off LESS if proportion results in a non-integer - slice index (i.e., conservatively slices off `proportiontocut` ). - - Parameters - ---------- - a : array_like - Input array - proportiontocut : float - Fraction to cut off of 'left' or 'right' of distribution - tail : {'left', 'right'}, optional - Defaults to 'right'. - - Returns - ------- - trim1 : ndarray - Trimmed version of array `a` - - """ - a = asarray(a) - if tail.lower() == 'right': - lowercut = 0 - uppercut = len(a) - int(proportiontocut*len(a)) - elif tail.lower() == 'left': - lowercut = int(proportiontocut*len(a)) - uppercut = len(a) - - return a[lowercut:uppercut] - - -def trim_mean(a, proportiontocut, axis=0): - """ - Return mean of array after trimming distribution from both lower and upper - tails. - - If `proportiontocut` = 0.1, slices off 'leftmost' and 'rightmost' 10% of - scores. Slices off LESS if proportion results in a non-integer slice - index (i.e., conservatively slices off `proportiontocut` ). - - Parameters - ---------- - a : array_like - Input array - proportiontocut : float - Fraction to cut off of both tails of the distribution - axis : int or None, optional - Axis along which the trimmed means are computed. The default is axis=0. - If axis is None then the trimmed mean will be computed for the - flattened array. - - Returns - ------- - trim_mean : ndarray - Mean of trimmed array. - - See Also - -------- - trimboth - - Examples - -------- - >>> from scipy import stats - >>> x = np.arange(20) - >>> stats.trim_mean(x, 0.1) - 9.5 - >>> x2 = x.reshape(5, 4) - >>> x2 - array([[ 0, 1, 2, 3], - [ 4, 5, 6, 7], - [ 8, 9, 10, 11], - [12, 13, 14, 15], - [16, 17, 18, 19]]) - >>> stats.trim_mean(x2, 0.25) - array([ 8., 9., 10., 11.]) - >>> stats.trim_mean(x2, 0.25, axis=1) - array([ 1.5, 5.5, 9.5, 13.5, 17.5]) - - """ - a = np.asarray(a) - if axis is None: - nobs = a.size - else: - nobs = a.shape[axis] - lowercut = int(proportiontocut * nobs) - uppercut = nobs - lowercut - 1 - if (lowercut > uppercut): - raise ValueError("Proportion too big.") - - try: - atmp = np.partition(a, (lowercut, uppercut), axis) - except AttributeError: - atmp = np.sort(a, axis) - - newa = trimboth(atmp, proportiontocut, axis=axis) - return np.mean(newa, axis=axis) - - -def f_oneway(*args): - """ - Performs a 1-way ANOVA. - - The one-way ANOVA tests the null hypothesis that two or more groups have - the same population mean. The test is applied to samples from two or - more groups, possibly with differing sizes. - - Parameters - ---------- - sample1, sample2, ... : array_like - The sample measurements for each group. - - Returns - ------- - F-value : float - The computed F-value of the test. - p-value : float - The associated p-value from the F-distribution. 
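The conservative trimming described above (``int(proportiontocut * nobs)`` values cut per tail) can be reproduced directly, assuming `scipy.stats.trim_mean`:

    import numpy as np
    from scipy import stats

    x = np.arange(20)
    # proportiontocut=0.1 cuts int(0.1 * 20) == 2 values from each tail.
    assert stats.trim_mean(x, 0.1) == np.mean(np.sort(x)[2:18])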
-
-    Notes
-    -----
-    The ANOVA test has important assumptions that must be satisfied in order
-    for the associated p-value to be valid.
-
-    1. The samples are independent.
-    2. Each sample is from a normally distributed population.
-    3. The population standard deviations of the groups are all equal.  This
-       property is known as homoscedasticity.
-
-    If these assumptions are not true for a given set of data, it may still
-    be possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`)
-    although with some loss of power.
-
-    The algorithm is from Heiman [2]_, pp. 394-397.
-
-    References
-    ----------
-    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
-           Statistics".  Chapter 14.
-           http://faculty.vassar.edu/lowry/ch14pt1.html
-
-    .. [2] Heiman, G.W.  Research Methods in Statistics.  2002.
-
-    """
-    args = [np.asarray(arg, dtype=float) for arg in args]
-    na = len(args)    # ANOVA on 'na' groups, each in its own array
-    alldata = np.concatenate(args)
-    bign = len(alldata)
-    sstot = ss(alldata) - (square_of_sums(alldata) / float(bign))
-    ssbn = 0
-    for a in args:
-        ssbn += square_of_sums(a) / float(len(a))
-
-    ssbn -= (square_of_sums(alldata) / float(bign))
-    sswn = sstot - ssbn
-    dfbn = na - 1
-    dfwn = bign - na
-    msb = ssbn / float(dfbn)
-    msw = sswn / float(dfwn)
-    f = msb / msw
-    prob = special.fdtrc(dfbn, dfwn, f)   # equivalent to stats.f.sf
-    return f, prob
-
-
-def pearsonr(x, y):
-    """
-    Calculates a Pearson correlation coefficient and the p-value for testing
-    non-correlation.
-
-    The Pearson correlation coefficient measures the linear relationship
-    between two datasets.  Strictly speaking, Pearson's correlation requires
-    that each dataset be normally distributed.  Like other correlation
-    coefficients, this one varies between -1 and +1 with 0 implying no
-    correlation.  Correlations of -1 or +1 imply an exact linear
-    relationship.  Positive correlations imply that as x increases, so does
-    y.  Negative correlations imply that as x increases, y decreases.
-
-    The p-value roughly indicates the probability of an uncorrelated system
-    producing datasets that have a Pearson correlation at least as extreme
-    as the one computed from these datasets.  The p-values are not entirely
-    reliable but are probably reasonable for datasets larger than 500 or so.
-
-    Parameters
-    ----------
-    x : (N,) array_like
-        Input
-    y : (N,) array_like
-        Input
-
-    Returns
-    -------
-    (Pearson's correlation coefficient,
-     2-tailed p-value)
-
-    References
-    ----------
-    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
-
-    """
-    # x and y should have same length.
-    x = np.asarray(x)
-    y = np.asarray(y)
-    n = len(x)
-    mx = x.mean()
-    my = y.mean()
-    xm, ym = x - mx, y - my
-    r_num = np.add.reduce(xm * ym)
-    r_den = np.sqrt(ss(xm) * ss(ym))
-    r = r_num / r_den
-
-    # Presumably, if abs(r) > 1, then it is only some small artifact of
-    # floating point arithmetic.
-    r = max(min(r, 1.0), -1.0)
-    df = n - 2
-    if abs(r) == 1.0:
-        prob = 0.0
-    else:
-        t_squared = r*r * (df / ((1.0 - r) * (1.0 + r)))
-        prob = betai(0.5*df, 0.5, df / (df + t_squared))
-    return r, prob
-
-
-def fisher_exact(table, alternative='two-sided'):
-    """Performs a Fisher exact test on a 2x2 contingency table.
-
-    Parameters
-    ----------
-    table : array_like of ints
-        A 2x2 contingency table.  Elements should be non-negative integers.
-    alternative : {'two-sided', 'less', 'greater'}, optional
-        Defines the alternative hypothesis of the test.
-        Default is 'two-sided'.
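`pearsonr` above is the textbook formula plus clamping; a minimal cross-check of the r computation, assuming `scipy.stats.pearsonr`:

    import numpy as np
    from scipy import stats

    x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    y = np.array([2.0, 1.0, 4.0, 3.0, 5.0])

    xm, ym = x - x.mean(), y - y.mean()
    r_manual = np.sum(xm * ym) / np.sqrt(np.sum(xm**2) * np.sum(ym**2))
    r, p = stats.pearsonr(x, y)
    assert np.allclose(r_manual, r)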
- - Returns - ------- - oddsratio : float - This is prior odds ratio and not a posterior estimate. - p_value : float - P-value, the probability of obtaining a distribution at least as - extreme as the one that was actually observed, assuming that the - null hypothesis is true. - - See Also - -------- - chi2_contingency : Chi-square test of independence of variables in a - contingency table. - - Notes - ----- - The calculated odds ratio is different from the one R uses. In R language, - this implementation returns the (more common) "unconditional Maximum - Likelihood Estimate", while R uses the "conditional Maximum Likelihood - Estimate". - - For tables with large numbers the (inexact) chi-square test implemented - in the function `chi2_contingency` can also be used. - - Examples - -------- - Say we spend a few days counting whales and sharks in the Atlantic and - Indian oceans. In the Atlantic ocean we find 8 whales and 1 shark, in the - Indian ocean 2 whales and 5 sharks. Then our contingency table is:: - - Atlantic Indian - whales 8 2 - sharks 1 5 - - We use this table to find the p-value: - - >>> oddsratio, pvalue = stats.fisher_exact([[8, 2], [1, 5]]) - >>> pvalue - 0.0349... - - The probability that we would observe this or an even more imbalanced ratio - by chance is about 3.5%. A commonly used significance level is 5%, if we - adopt that we can therefore conclude that our observed imbalance is - statistically significant; whales prefer the Atlantic while sharks prefer - the Indian ocean. - - """ - hypergeom = distributions.hypergeom - c = np.asarray(table, dtype=np.int64) # int32 is not enough for the algorithm - if not c.shape == (2, 2): - raise ValueError("The input `table` must be of shape (2, 2).") - - if np.any(c < 0): - raise ValueError("All values in `table` must be nonnegative.") - - if 0 in c.sum(axis=0) or 0 in c.sum(axis=1): - # If both values in a row or column are zero, the p-value is 1 and - # the odds ratio is NaN. - return np.nan, 1.0 - - if c[1,0] > 0 and c[0,1] > 0: - oddsratio = c[0,0] * c[1,1] / float(c[1,0] * c[0,1]) - else: - oddsratio = np.inf - - n1 = c[0,0] + c[0,1] - n2 = c[1,0] + c[1,1] - n = c[0,0] + c[1,0] - - def binary_search(n, n1, n2, side): - """Binary search for where to begin lower/upper halves in two-sided - test. - """ - if side == "upper": - minval = mode - maxval = n - else: - minval = 0 - maxval = mode - guess = -1 - while maxval - minval > 1: - if maxval == minval + 1 and guess == minval: - guess = maxval - else: - guess = (maxval + minval) // 2 - pguess = hypergeom.pmf(guess, n1 + n2, n1, n) - if side == "upper": - ng = guess - 1 - else: - ng = guess + 1 - if pguess <= pexact and hypergeom.pmf(ng, n1 + n2, n1, n) > pexact: - break - elif pguess < pexact: - maxval = guess - else: - minval = guess - if guess == -1: - guess = minval - if side == "upper": - while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon: - guess -= 1 - while hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon: - guess += 1 - else: - while hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon: - guess += 1 - while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon: - guess -= 1 - return guess - - if alternative == 'less': - pvalue = hypergeom.cdf(c[0,0], n1 + n2, n1, n) - elif alternative == 'greater': - # Same formula as the 'less' case, but with the second column. 
-        pvalue = hypergeom.cdf(c[0,1], n1 + n2, n1, c[0,1] + c[1,1])
-    elif alternative == 'two-sided':
-        mode = int(float((n + 1) * (n1 + 1)) / (n1 + n2 + 2))
-        pexact = hypergeom.pmf(c[0,0], n1 + n2, n1, n)
-        pmode = hypergeom.pmf(mode, n1 + n2, n1, n)
-
-        epsilon = 1 - 1e-4
-        if np.abs(pexact - pmode) / np.maximum(pexact, pmode) <= 1 - epsilon:
-            return oddsratio, 1.
-
-        elif c[0,0] < mode:
-            plower = hypergeom.cdf(c[0,0], n1 + n2, n1, n)
-            if hypergeom.pmf(n, n1 + n2, n1, n) > pexact / epsilon:
-                return oddsratio, plower
-
-            guess = binary_search(n, n1, n2, "upper")
-            pvalue = plower + hypergeom.sf(guess - 1, n1 + n2, n1, n)
-        else:
-            pupper = hypergeom.sf(c[0,0] - 1, n1 + n2, n1, n)
-            if hypergeom.pmf(0, n1 + n2, n1, n) > pexact / epsilon:
-                return oddsratio, pupper
-
-            guess = binary_search(n, n1, n2, "lower")
-            pvalue = pupper + hypergeom.cdf(guess, n1 + n2, n1, n)
-    else:
-        msg = "`alternative` should be one of {'two-sided', 'less', 'greater'}"
-        raise ValueError(msg)
-
-    if pvalue > 1.0:
-        pvalue = 1.0
-    return oddsratio, pvalue
-
-
-def spearmanr(a, b=None, axis=0):
-    """
-    Calculates a Spearman rank-order correlation coefficient and the p-value
-    to test for non-correlation.
-
-    The Spearman correlation is a nonparametric measure of the monotonicity
-    of the relationship between two datasets.  Unlike the Pearson
-    correlation, the Spearman correlation does not assume that both datasets
-    are normally distributed.  Like other correlation coefficients, this one
-    varies between -1 and +1 with 0 implying no correlation.  Correlations
-    of -1 or +1 imply an exact monotonic relationship.  Positive correlations
-    imply that as x increases, so does y.  Negative correlations imply that
-    as x increases, y decreases.
-
-    The p-value roughly indicates the probability of an uncorrelated system
-    producing datasets that have a Spearman correlation at least as extreme
-    as the one computed from these datasets.  The p-values are not entirely
-    reliable but are probably reasonable for datasets larger than 500 or so.
-
-    Parameters
-    ----------
-    a, b : 1D or 2D array_like, b is optional
-        One or two 1-D or 2-D arrays containing multiple variables and
-        observations.  Each column of `a` and `b` represents a variable, and
-        each row entry a single observation of those variables.  See also
-        `axis`.  Both arrays need to have the same length in the `axis`
-        dimension.
-    axis : int or None, optional
-        If axis=0 (default), then each column represents a variable, with
-        observations in the rows.  If axis=1, the relationship is transposed:
-        each row represents a variable, while the columns contain
-        observations.  If axis=None, then both arrays will be raveled.
-
-    Returns
-    -------
-    rho : float or ndarray (2-D square)
-        Spearman correlation matrix or correlation coefficient (if only 2
-        variables are given as parameters).  Correlation matrix is square
-        with length equal to total number of variables (columns or rows) in
-        a and b combined.
-    p-value : float
-        The two-sided p-value for a hypothesis test whose null hypothesis is
-        that two sets of data are uncorrelated, has same dimension as rho.
-
-    Notes
-    -----
-    Changes in scipy 0.8.0: rewrite to add tie-handling, and axis.
-
-    References
-    ----------
-    [CRCProbStat2000]_ Section 14.7
-
-    .. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
-       Probability and Statistics Tables and Formulae. Chapman & Hall: New
-       York. 2000.
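For the unconditional MLE odds ratio returned by `fisher_exact` above, the 2x2 case reduces to a cross-product ratio; a quick check against the whales/sharks table from its docstring, assuming `scipy.stats.fisher_exact`:

    from scipy import stats

    oddsratio, pvalue = stats.fisher_exact([[8, 2], [1, 5]])
    assert oddsratio == (8 * 5) / float(2 * 1)   # == 20.0
    assert 0.03 < pvalue < 0.04                  # ~0.0349, as documented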
- - Examples - -------- - >>> spearmanr([1,2,3,4,5],[5,6,7,8,7]) - (0.82078268166812329, 0.088587005313543798) - >>> np.random.seed(1234321) - >>> x2n=np.random.randn(100,2) - >>> y2n=np.random.randn(100,2) - >>> spearmanr(x2n) - (0.059969996999699973, 0.55338590803773591) - >>> spearmanr(x2n[:,0], x2n[:,1]) - (0.059969996999699973, 0.55338590803773591) - >>> rho, pval = spearmanr(x2n,y2n) - >>> rho - array([[ 1. , 0.05997 , 0.18569457, 0.06258626], - [ 0.05997 , 1. , 0.110003 , 0.02534653], - [ 0.18569457, 0.110003 , 1. , 0.03488749], - [ 0.06258626, 0.02534653, 0.03488749, 1. ]]) - >>> pval - array([[ 0. , 0.55338591, 0.06435364, 0.53617935], - [ 0.55338591, 0. , 0.27592895, 0.80234077], - [ 0.06435364, 0.27592895, 0. , 0.73039992], - [ 0.53617935, 0.80234077, 0.73039992, 0. ]]) - >>> rho, pval = spearmanr(x2n.T, y2n.T, axis=1) - >>> rho - array([[ 1. , 0.05997 , 0.18569457, 0.06258626], - [ 0.05997 , 1. , 0.110003 , 0.02534653], - [ 0.18569457, 0.110003 , 1. , 0.03488749], - [ 0.06258626, 0.02534653, 0.03488749, 1. ]]) - >>> spearmanr(x2n, y2n, axis=None) - (0.10816770419260482, 0.1273562188027364) - >>> spearmanr(x2n.ravel(), y2n.ravel()) - (0.10816770419260482, 0.1273562188027364) - - >>> xint = np.random.randint(10,size=(100,2)) - >>> spearmanr(xint) - (0.052760927029710199, 0.60213045837062351) - - """ - a, axisout = _chk_asarray(a, axis) - ar = np.apply_along_axis(rankdata,axisout,a) - - br = None - if b is not None: - b, axisout = _chk_asarray(b, axis) - br = np.apply_along_axis(rankdata,axisout,b) - n = a.shape[axisout] - rs = np.corrcoef(ar,br,rowvar=axisout) - - olderr = np.seterr(divide='ignore') # rs can have elements equal to 1 - try: - t = rs * np.sqrt((n-2) / ((rs+1.0)*(1.0-rs))) - finally: - np.seterr(**olderr) - prob = distributions.t.sf(np.abs(t),n-2)*2 - - if rs.shape == (2,2): - return rs[1,0], prob[1,0] - else: - return rs, prob - - -def pointbiserialr(x, y): - """Calculates a point biserial correlation coefficient and the associated - p-value. - - The point biserial correlation is used to measure the relationship - between a binary variable, x, and a continuous variable, y. Like other - correlation coefficients, this one varies between -1 and +1 with 0 - implying no correlation. Correlations of -1 or +1 imply a determinative - relationship. - - This function uses a shortcut formula but produces the same result as - `pearsonr`. - - Parameters - ---------- - x : array_like of bools - Input array. - y : array_like - Input array. - - Returns - ------- - r : float - R value - p-value : float - 2-tailed p-value - - References - ---------- - http://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient - - Examples - -------- - >>> from scipy import stats - >>> a = np.array([0, 0, 0, 1, 1, 1, 1]) - >>> b = np.arange(7) - >>> stats.pointbiserialr(a, b) - (0.8660254037844386, 0.011724811003954652) - >>> stats.pearsonr(a, b) - (0.86602540378443871, 0.011724811003954626) - >>> np.corrcoef(a, b) - array([[ 1. , 0.8660254], - [ 0.8660254, 1. 
]]) - - """ - x = np.asarray(x, dtype=bool) - y = np.asarray(y, dtype=float) - n = len(x) - - # phat is the fraction of x values that are True - phat = x.sum() / float(len(x)) - y0 = y[~x] # y-values where x is False - y1 = y[x] # y-values where x is True - y0m = y0.mean() - y1m = y1.mean() - - # phat - phat**2 is more stable than phat*(1-phat) - rpb = (y1m - y0m) * np.sqrt(phat - phat**2) / y.std() - - df = n-2 - # fixme: see comment about TINY in pearsonr() - TINY = 1e-20 - t = rpb*np.sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY))) - prob = betai(0.5*df, 0.5, df/(df+t*t)) - return rpb, prob - - -def kendalltau(x, y, initial_lexsort=True): - """ - Calculates Kendall's tau, a correlation measure for ordinal data. - - Kendall's tau is a measure of the correspondence between two rankings. - Values close to 1 indicate strong agreement, values close to -1 indicate - strong disagreement. This is the tau-b version of Kendall's tau which - accounts for ties. - - Parameters - ---------- - x, y : array_like - Arrays of rankings, of the same shape. If arrays are not 1-D, they will - be flattened to 1-D. - initial_lexsort : bool, optional - Whether to use lexsort or quicksort as the sorting method for the - initial sort of the inputs. Default is lexsort (True), for which - `kendalltau` is of complexity O(n log(n)). If False, the complexity is - O(n^2), but with a smaller pre-factor (so quicksort may be faster for - small arrays). - - Returns - ------- - Kendall's tau : float - The tau statistic. - p-value : float - The two-sided p-value for a hypothesis test whose null hypothesis is - an absence of association, tau = 0. - - Notes - ----- - The definition of Kendall's tau that is used is:: - - tau = (P - Q) / sqrt((P + Q + T) * (P + Q + U)) - - where P is the number of concordant pairs, Q the number of discordant - pairs, T the number of ties only in `x`, and U the number of ties only in - `y`. If a tie occurs for the same pair in both `x` and `y`, it is not - added to either T or U. - - References - ---------- - W.R. Knight, "A Computer Method for Calculating Kendall's Tau with - Ungrouped Data", Journal of the American Statistical Association, Vol. 61, - No. 314, Part 1, pp. 436-439, 1966. 
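The docstring above claims `pointbiserialr` uses a shortcut formula but produces the same result as `pearsonr`; that equivalence is easy to verify on its own example data, assuming both functions from `scipy.stats`:

    import numpy as np
    from scipy import stats

    a = np.array([0, 0, 0, 1, 1, 1, 1])
    b = np.arange(7)

    r_pb, p_pb = stats.pointbiserialr(a, b)
    r_p, p_p = stats.pearsonr(a, b)
    assert np.allclose(r_pb, r_p)   # both ~0.8660254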
- - Examples - -------- - >>> import scipy.stats as stats - >>> x1 = [12, 2, 1, 12, 2] - >>> x2 = [1, 4, 7, 1, 0] - >>> tau, p_value = stats.kendalltau(x1, x2) - >>> tau - -0.47140452079103173 - >>> p_value - 0.24821309157521476 - - """ - - x = np.asarray(x).ravel() - y = np.asarray(y).ravel() - - if not x.size or not y.size: - return (np.nan, np.nan) # Return NaN if arrays are empty - - n = np.int64(len(x)) - temp = list(range(n)) # support structure used by mergesort - # this closure recursively sorts sections of perm[] by comparing - # elements of y[perm[]] using temp[] as support - # returns the number of swaps required by an equivalent bubble sort - - def mergesort(offs, length): - exchcnt = 0 - if length == 1: - return 0 - if length == 2: - if y[perm[offs]] <= y[perm[offs+1]]: - return 0 - t = perm[offs] - perm[offs] = perm[offs+1] - perm[offs+1] = t - return 1 - length0 = length // 2 - length1 = length - length0 - middle = offs + length0 - exchcnt += mergesort(offs, length0) - exchcnt += mergesort(middle, length1) - if y[perm[middle - 1]] < y[perm[middle]]: - return exchcnt - # merging - i = j = k = 0 - while j < length0 or k < length1: - if k >= length1 or (j < length0 and y[perm[offs + j]] <= - y[perm[middle + k]]): - temp[i] = perm[offs + j] - d = i - j - j += 1 - else: - temp[i] = perm[middle + k] - d = (offs + i) - (middle + k) - k += 1 - if d > 0: - exchcnt += d - i += 1 - perm[offs:offs+length] = temp[0:length] - return exchcnt - - # initial sort on values of x and, if tied, on values of y - if initial_lexsort: - # sort implemented as mergesort, worst case: O(n log(n)) - perm = np.lexsort((y, x)) - else: - # sort implemented as quicksort, 30% faster but with worst case: O(n^2) - perm = list(range(n)) - perm.sort(key=lambda a: (x[a], y[a])) - - # compute joint ties - first = 0 - t = 0 - for i in xrange(1, n): - if x[perm[first]] != x[perm[i]] or y[perm[first]] != y[perm[i]]: - t += ((i - first) * (i - first - 1)) // 2 - first = i - t += ((n - first) * (n - first - 1)) // 2 - - # compute ties in x - first = 0 - u = 0 - for i in xrange(1,n): - if x[perm[first]] != x[perm[i]]: - u += ((i - first) * (i - first - 1)) // 2 - first = i - u += ((n - first) * (n - first - 1)) // 2 - - # count exchanges - exchanges = mergesort(0, n) - # compute ties in y after mergesort with counting - first = 0 - v = 0 - for i in xrange(1,n): - if y[perm[first]] != y[perm[i]]: - v += ((i - first) * (i - first - 1)) // 2 - first = i - v += ((n - first) * (n - first - 1)) // 2 - - tot = (n * (n - 1)) // 2 - if tot == u or tot == v: - return (np.nan, np.nan) # Special case for all ties in both ranks - - # Prevent overflow; equal to np.sqrt((tot - u) * (tot - v)) - denom = np.exp(0.5 * (np.log(tot - u) + np.log(tot - v))) - tau = ((tot - (v + u - t)) - 2.0 * exchanges) / denom - - # what follows reproduces the ending of Gary Strangman's original - # stats.kendalltau() in SciPy - svar = (4.0 * n + 10.0) / (9.0 * n * (n - 1)) - z = tau / np.sqrt(svar) - prob = special.erfc(np.abs(z) / 1.4142136) - - return tau, prob - - -def linregress(x, y=None): - """ - Calculate a regression line - - This computes a least-squares regression for two sets of measurements. - - Parameters - ---------- - x, y : array_like - two sets of measurements. Both arrays should have the same length. - If only x is given (and y=None), then it must be a two-dimensional - array where one dimension has length 2. The two sets of measurements - are then found by splitting the array along the length-2 dimension. 
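The tau-b definition given above (concordant minus discordant pairs, with the T/U tie terms) can be evaluated by brute force on the kendalltau docstring's own example; a sketch that reproduces ``tau == -0.4714...``:

    import numpy as np
    from itertools import combinations

    x = [12, 2, 1, 12, 2]
    y = [1, 4, 7, 1, 0]

    P = Q = T = U = 0
    for (xi, yi), (xj, yj) in combinations(zip(x, y), 2):
        dx, dy = xi - xj, yi - yj
        if dx == 0 and dy == 0:
            continue              # joint ties count in neither T nor U
        elif dx == 0:
            T += 1                # tie only in x
        elif dy == 0:
            U += 1                # tie only in y
        elif dx * dy > 0:
            P += 1                # concordant
        else:
            Q += 1                # discordant

    tau = (P - Q) / np.sqrt((P + Q + T) * (P + Q + U))
    assert np.allclose(tau, -0.47140452079103173)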
- - Returns - ------- - slope : float - slope of the regression line - intercept : float - intercept of the regression line - r-value : float - correlation coefficient - p-value : float - two-sided p-value for a hypothesis test whose null hypothesis is - that the slope is zero. - stderr : float - Standard error of the estimate - - - Examples - -------- - >>> from scipy import stats - >>> import numpy as np - >>> x = np.random.random(10) - >>> y = np.random.random(10) - >>> slope, intercept, r_value, p_value, std_err = stats.linregress(x,y) - - # To get coefficient of determination (r_squared) - - >>> print "r-squared:", r_value**2 - r-squared: 0.15286643777 - - """ - TINY = 1.0e-20 - if y is None: # x is a (2, N) or (N, 2) shaped array_like - x = asarray(x) - if x.shape[0] == 2: - x, y = x - elif x.shape[1] == 2: - x, y = x.T - else: - msg = "If only `x` is given as input, it has to be of shape (2, N) \ - or (N, 2), provided shape was %s" % str(x.shape) - raise ValueError(msg) - else: - x = asarray(x) - y = asarray(y) - n = len(x) - xmean = np.mean(x,None) - ymean = np.mean(y,None) - - # average sum of squares: - ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat - r_num = ssxym - r_den = np.sqrt(ssxm*ssym) - if r_den == 0.0: - r = 0.0 - else: - r = r_num / r_den - # test for numerical error propagation - if (r > 1.0): - r = 1.0 - elif (r < -1.0): - r = -1.0 - - df = n-2 - t = r*np.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY))) - prob = distributions.t.sf(np.abs(t),df)*2 - slope = r_num / ssxm - intercept = ymean - slope*xmean - sterrest = np.sqrt((1-r*r)*ssym / ssxm / df) - return slope, intercept, r, prob, sterrest - - -def theilslopes(y, x=None, alpha=0.95): - r""" - Computes the Theil-Sen estimator for a set of points (x, y). - - `theilslopes` implements a method for robust linear regression. It - computes the slope as the median of all slopes between paired values. - - Parameters - ---------- - y : array_like - Dependent variable. - x : {None, array_like}, optional - Independent variable. If None, use ``arange(len(y))`` instead. - alpha : float - Confidence degree between 0 and 1. Default is 95% confidence. - Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are - interpreted as "find the 90% confidence interval". - - Returns - ------- - medslope : float - Theil slope. - medintercept : float - Intercept of the Theil line, as ``median(y) - medslope*median(x)``. - lo_slope : float - Lower bound of the confidence interval on `medslope`. - up_slope : float - Upper bound of the confidence interval on `medslope`. - - Notes - ----- - The implementation of `theilslopes` follows [1]_. The intercept is - not defined in [1]_, and here it is defined as ``median(y) - - medslope*median(x)``, which is given in [3]_. Other definitions of - the intercept exist in the literature. A confidence interval for - the intercept is not given as this question is not addressed in - [1]_. - - References - ---------- - .. [1] P.K. Sen, "Estimates of the regression coefficient based on Kendall's tau", - J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968. - .. [2] H. Theil, "A rank-invariant method of linear and polynomial - regression analysis I, II and III", Nederl. Akad. Wetensch., Proc. - 53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950. - .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed., - John Wiley and Sons, New York, pp. 493. 
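On exactly linear data the five return values of `linregress` above are fully determined; a minimal sanity check, assuming `scipy.stats.linregress`:

    import numpy as np
    from scipy import stats

    x = np.array([0.0, 1.0, 2.0, 3.0])
    y = 2.0 * x + 1.0   # exact line, so r == 1 and stderr == 0

    slope, intercept, r, p, stderr = stats.linregress(x, y)
    assert np.allclose([slope, intercept, r, stderr], [2.0, 1.0, 1.0, 0.0])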
-
-    Examples
-    --------
-    >>> from scipy import stats
-    >>> import matplotlib.pyplot as plt
-
-    >>> x = np.linspace(-5, 5, num=150)
-    >>> y = x + np.random.normal(size=x.size)
-    >>> y[11:15] += 10  # add outliers
-    >>> y[-5:] -= 7
-
-    Compute the slope, intercept and 90% confidence interval.  For
-    comparison, also compute the least-squares fit with `linregress`:
-
-    >>> res = stats.theilslopes(y, x, 0.90)
-    >>> lsq_res = stats.linregress(x, y)
-
-    Plot the results.  The Theil-Sen regression line is shown in red, with
-    the dashed red lines illustrating the confidence interval of the slope
-    (note that the dashed red lines are not the confidence interval of the
-    regression as the confidence interval of the intercept is not included).
-    The green line shows the least-squares fit for comparison.
-
-    >>> fig = plt.figure()
-    >>> ax = fig.add_subplot(111)
-    >>> ax.plot(x, y, 'b.')
-    >>> ax.plot(x, res[1] + res[0] * x, 'r-')
-    >>> ax.plot(x, res[1] + res[2] * x, 'r--')
-    >>> ax.plot(x, res[1] + res[3] * x, 'r--')
-    >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
-    >>> plt.show()
-
-    """
-    y = np.asarray(y).flatten()
-    if x is None:
-        x = np.arange(len(y), dtype=float)
-    else:
-        x = np.asarray(x, dtype=float).flatten()
-        if len(x) != len(y):
-            raise ValueError("Incompatible lengths! (%s<>%s)" %
-                             (len(y), len(x)))
-
-    # Compute sorted slopes only when deltax > 0
-    deltax = x[:, np.newaxis] - x
-    deltay = y[:, np.newaxis] - y
-    slopes = deltay[deltax > 0] / deltax[deltax > 0]
-    slopes.sort()
-    medslope = np.median(slopes)
-    medinter = np.median(y) - medslope * np.median(x)
-    # Now compute confidence intervals
-    if alpha > 0.5:
-        alpha = 1. - alpha
-
-    z = distributions.norm.ppf(alpha / 2.)
-    # This implements (2.6) from Sen (1968)
-    _, nxreps = find_repeats(x)
-    _, nyreps = find_repeats(y)
-    nt = len(slopes)  # N in Sen (1968)
-    ny = len(y)       # n in Sen (1968)
-    # Equation 2.6 in Sen (1968):
-    sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
-                     np.sum([k * (k-1) * (2*k + 5) for k in nxreps]) -
-                     np.sum([k * (k-1) * (2*k + 5) for k in nyreps]))
-    # Find the confidence interval indices in `slopes`
-    sigma = np.sqrt(sigsq)
-    Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
-    Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
-    delta = slopes[[Rl, Ru]]
-    return medslope, medinter, delta[0], delta[1]
-
-
-#####################################
-#####  INFERENTIAL STATISTICS  #####
-#####################################
-
-def ttest_1samp(a, popmean, axis=0):
-    """
-    Calculates the T-test for the mean of ONE group of scores.
-
-    This is a two-sided test for the null hypothesis that the expected value
-    (mean) of a sample of independent observations `a` is equal to the given
-    population mean, `popmean`.
-
-    Parameters
-    ----------
-    a : array_like
-        sample observation
-    popmean : float or array_like
-        expected value in null hypothesis, if array_like then it must have
-        the same shape as `a` excluding the axis dimension
-    axis : int, optional, (default axis=0)
-        Axis can equal None (ravel array first), or an integer (the axis
-        over which to operate on a).
-
-    Returns
-    -------
-    t : float or array
-        t-statistic
-    prob : float or array
-        two-tailed p-value
-
-    Examples
-    --------
-    >>> from scipy import stats
-
-    >>> np.random.seed(7654567)  # fix seed to get the same result
-    >>> rvs = stats.norm.rvs(loc=5, scale=10, size=(50,2))
-
-    Test if the mean of the random sample is equal to the true mean, and to
-    a different mean.  We reject the null hypothesis in the second case and
-    don't reject it in the first case.
- - >>> stats.ttest_1samp(rvs,5.0) - (array([-0.68014479, -0.04323899]), array([ 0.49961383, 0.96568674])) - >>> stats.ttest_1samp(rvs,0.0) - (array([ 2.77025808, 4.11038784]), array([ 0.00789095, 0.00014999])) - - Examples using axis and non-scalar dimension for population mean. - - >>> stats.ttest_1samp(rvs,[5.0,0.0]) - (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04])) - >>> stats.ttest_1samp(rvs.T,[5.0,0.0],axis=1) - (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04])) - >>> stats.ttest_1samp(rvs,[[5.0],[0.0]]) - (array([[-0.68014479, -0.04323899], - [ 2.77025808, 4.11038784]]), array([[ 4.99613833e-01, 9.65686743e-01], - [ 7.89094663e-03, 1.49986458e-04]])) - - """ - a, axis = _chk_asarray(a, axis) - n = a.shape[axis] - df = n - 1 - - d = np.mean(a, axis) - popmean - v = np.var(a, axis, ddof=1) - denom = np.sqrt(v / float(n)) - - t = np.divide(d, denom) - t, prob = _ttest_finish(df, t) - - return t, prob - - -def _ttest_finish(df,t): - """Common code between all 3 t-test functions.""" - prob = distributions.t.sf(np.abs(t), df) * 2 # use np.abs to get upper tail - if t.ndim == 0: - t = t[()] - - return t, prob - - -def ttest_ind(a, b, axis=0, equal_var=True): - """ - Calculates the T-test for the means of TWO INDEPENDENT samples of scores. - - This is a two-sided test for the null hypothesis that 2 independent samples - have identical average (expected) values. This test assumes that the - populations have identical variances. - - Parameters - ---------- - a, b : array_like - The arrays must have the same shape, except in the dimension - corresponding to `axis` (the first, by default). - axis : int, optional - Axis can equal None (ravel array first), or an integer (the axis - over which to operate on a and b). - equal_var : bool, optional - If True (default), perform a standard independent 2 sample test - that assumes equal population variances [1]_. - If False, perform Welch's t-test, which does not assume equal - population variance [2]_. - - .. versionadded:: 0.11.0 - - Returns - ------- - t : float or array - The calculated t-statistic. - prob : float or array - The two-tailed p-value. - - Notes - ----- - We can use this test, if we observe two independent samples from - the same or different population, e.g. exam scores of boys and - girls or of two ethnic groups. The test measures whether the - average (expected) value differs significantly across samples. If - we observe a large p-value, for example larger than 0.05 or 0.1, - then we cannot reject the null hypothesis of identical average scores. - If the p-value is smaller than the threshold, e.g. 1%, 5% or 10%, - then we reject the null hypothesis of equal averages. - - References - ---------- - .. [1] http://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test - - .. 
[2] http://en.wikipedia.org/wiki/Welch%27s_t_test - - Examples - -------- - >>> from scipy import stats - >>> np.random.seed(12345678) - - Test with sample with identical means: - - >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500) - >>> rvs2 = stats.norm.rvs(loc=5,scale=10,size=500) - >>> stats.ttest_ind(rvs1,rvs2) - (0.26833823296239279, 0.78849443369564776) - >>> stats.ttest_ind(rvs1,rvs2, equal_var = False) - (0.26833823296239279, 0.78849452749500748) - - `ttest_ind` underestimates p for unequal variances: - - >>> rvs3 = stats.norm.rvs(loc=5, scale=20, size=500) - >>> stats.ttest_ind(rvs1, rvs3) - (-0.46580283298287162, 0.64145827413436174) - >>> stats.ttest_ind(rvs1, rvs3, equal_var = False) - (-0.46580283298287162, 0.64149646246569292) - - When n1 != n2, the equal variance t-statistic is no longer equal to the - unequal variance t-statistic: - - >>> rvs4 = stats.norm.rvs(loc=5, scale=20, size=100) - >>> stats.ttest_ind(rvs1, rvs4) - (-0.99882539442782481, 0.3182832709103896) - >>> stats.ttest_ind(rvs1, rvs4, equal_var = False) - (-0.69712570584654099, 0.48716927725402048) - - T-test with different means, variance, and n: - - >>> rvs5 = stats.norm.rvs(loc=8, scale=20, size=100) - >>> stats.ttest_ind(rvs1, rvs5) - (-1.4679669854490653, 0.14263895620529152) - >>> stats.ttest_ind(rvs1, rvs5, equal_var = False) - (-0.94365973617132992, 0.34744170334794122) - - """ - a, b, axis = _chk2_asarray(a, b, axis) - if a.size == 0 or b.size == 0: - return (np.nan, np.nan) - - v1 = np.var(a, axis, ddof=1) - v2 = np.var(b, axis, ddof=1) - n1 = a.shape[axis] - n2 = b.shape[axis] - - if (equal_var): - df = n1 + n2 - 2 - svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / float(df) - denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2)) - else: - vn1 = v1 / n1 - vn2 = v2 / n2 - df = ((vn1 + vn2)**2) / ((vn1**2) / (n1 - 1) + (vn2**2) / (n2 - 1)) - - # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0). - # Hence it doesn't matter what df is as long as it's not NaN. - df = np.where(np.isnan(df), 1, df) - denom = np.sqrt(vn1 + vn2) - - d = np.mean(a, axis) - np.mean(b, axis) - t = np.divide(d, denom) - t, prob = _ttest_finish(df, t) - - return t, prob - - -def ttest_rel(a, b, axis=0): - """ - Calculates the T-test on TWO RELATED samples of scores, a and b. - - This is a two-sided test for the null hypothesis that 2 related or - repeated samples have identical average (expected) values. - - Parameters - ---------- - a, b : array_like - The arrays must have the same shape. - axis : int, optional, (default axis=0) - Axis can equal None (ravel array first), or an integer (the axis - over which to operate on a and b). - - Returns - ------- - t : float or array - t-statistic - prob : float or array - two-tailed p-value - - Notes - ----- - Examples for the use are scores of the same set of student in - different exams, or repeated sampling from the same units. The - test measures whether the average score differs significantly - across samples (e.g. exams). If we observe a large p-value, for - example greater than 0.05 or 0.1 then we cannot reject the null - hypothesis of identical average scores. If the p-value is smaller - than the threshold, e.g. 1%, 5% or 10%, then we reject the null - hypothesis of equal averages. Small p-values are associated with - large t-statistics. 
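The `equal_var=False` branch above uses the Welch-Satterthwaite approximation for the degrees of freedom. Written out as a standalone helper (`welch_df` is an illustrative name):

    import numpy as np

    def welch_df(a, b):
        # Welch-Satterthwaite:
        # df = (v1/n1 + v2/n2)**2 / ((v1/n1)**2/(n1-1) + (v2/n2)**2/(n2-1))
        vn1 = np.var(a, ddof=1) / len(a)
        vn2 = np.var(b, ddof=1) / len(b)
        return (vn1 + vn2) ** 2 / (vn1 ** 2 / (len(a) - 1) +
                                   vn2 ** 2 / (len(b) - 1))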
- - References - ---------- - http://en.wikipedia.org/wiki/T-test#Dependent_t-test - - Examples - -------- - >>> from scipy import stats - >>> np.random.seed(12345678) # fix random seed to get same numbers - - >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500) - >>> rvs2 = (stats.norm.rvs(loc=5,scale=10,size=500) + - ... stats.norm.rvs(scale=0.2,size=500)) - >>> stats.ttest_rel(rvs1,rvs2) - (0.24101764965300962, 0.80964043445811562) - >>> rvs3 = (stats.norm.rvs(loc=8,scale=10,size=500) + - ... stats.norm.rvs(scale=0.2,size=500)) - >>> stats.ttest_rel(rvs1,rvs3) - (-3.9995108708727933, 7.3082402191726459e-005) - - """ - a, b, axis = _chk2_asarray(a, b, axis) - if a.shape[axis] != b.shape[axis]: - raise ValueError('unequal length arrays') - - if a.size == 0 or b.size == 0: - return (np.nan, np.nan) - - n = a.shape[axis] - df = float(n - 1) - - d = (a - b).astype(np.float64) - v = np.var(d, axis, ddof=1) - dm = np.mean(d, axis) - denom = np.sqrt(v / float(n)) - - t = np.divide(dm, denom) - t, prob = _ttest_finish(df, t) - - return t, prob - - -def kstest(rvs, cdf, args=(), N=20, alternative='two-sided', mode='approx'): - """ - Perform the Kolmogorov-Smirnov test for goodness of fit. - - This performs a test of the distribution G(x) of an observed - random variable against a given distribution F(x). Under the null - hypothesis the two distributions are identical, G(x)=F(x). The - alternative hypothesis can be either 'two-sided' (default), 'less' - or 'greater'. The KS test is only valid for continuous distributions. - - Parameters - ---------- - rvs : str, array or callable - If a string, it should be the name of a distribution in `scipy.stats`. - If an array, it should be a 1-D array of observations of random - variables. - If a callable, it should be a function to generate random variables; - it is required to have a keyword argument `size`. - cdf : str or callable - If a string, it should be the name of a distribution in `scipy.stats`. - If `rvs` is a string then `cdf` can be False or the same as `rvs`. - If a callable, that callable is used to calculate the cdf. - args : tuple, sequence, optional - Distribution parameters, used if `rvs` or `cdf` are strings. - N : int, optional - Sample size if `rvs` is string or callable. Default is 20. - alternative : {'two-sided', 'less','greater'}, optional - Defines the alternative hypothesis (see explanation above). - Default is 'two-sided'. - mode : 'approx' (default) or 'asymp', optional - Defines the distribution used for calculating the p-value. - - - 'approx' : use approximation to exact distribution of test statistic - - 'asymp' : use asymptotic distribution of test statistic - - Returns - ------- - D : float - KS test statistic, either D, D+ or D-. - p-value : float - One-tailed or two-tailed p-value. - - Notes - ----- - In the one-sided test, the alternative is that the empirical - cumulative distribution function of the random variable is "less" - or "greater" than the cumulative distribution function F(x) of the - hypothesis, ``G(x)<=F(x)``, resp. ``G(x)>=F(x)``. 
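In terms of the empirical CDF, the one-sided statistics described above are D+ = max(ECDF - F) and D- = max(F - ECDF). A short sketch of both quantities, mirroring the implementation further below (the sample and seed are illustrative):

    import numpy as np
    from scipy import stats

    vals = np.sort(stats.norm.rvs(size=100, random_state=987654321))
    cdfvals = stats.norm.cdf(vals)   # F evaluated at the sorted sample
    n = len(vals)

    d_plus = (np.arange(1.0, n + 1) / n - cdfvals).max()  # ECDF above F
    d_minus = (cdfvals - np.arange(0.0, n) / n).max()     # ECDF below F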
- - Examples - -------- - >>> from scipy import stats - - >>> x = np.linspace(-15, 15, 9) - >>> stats.kstest(x, 'norm') - (0.44435602715924361, 0.038850142705171065) - - >>> np.random.seed(987654321) # set random seed to get the same result - >>> stats.kstest('norm', False, N=100) - (0.058352892479417884, 0.88531190944151261) - - The above lines are equivalent to: - - >>> np.random.seed(987654321) - >>> stats.kstest(stats.norm.rvs(size=100), 'norm') - (0.058352892479417884, 0.88531190944151261) - - *Test against one-sided alternative hypothesis* - - Shift distribution to larger values, so that ``cdf_dgp(x) < norm.cdf(x)``: - - >>> np.random.seed(987654321) - >>> x = stats.norm.rvs(loc=0.2, size=100) - >>> stats.kstest(x,'norm', alternative = 'less') - (0.12464329735846891, 0.040989164077641749) - - Reject equal distribution against alternative hypothesis: less - - >>> stats.kstest(x,'norm', alternative = 'greater') - (0.0072115233216311081, 0.98531158590396395) - - Don't reject equal distribution against alternative hypothesis: greater - - >>> stats.kstest(x,'norm', mode='asymp') - (0.12464329735846891, 0.08944488871182088) - - *Testing t distributed random variables against normal distribution* - - With 100 degrees of freedom the t distribution looks close to the normal - distribution, and the K-S test does not reject the hypothesis that the - sample came from the normal distribution: - - >>> np.random.seed(987654321) - >>> stats.kstest(stats.t.rvs(100,size=100),'norm') - (0.072018929165471257, 0.67630062862479168) - - With 3 degrees of freedom the t distribution looks sufficiently different - from the normal distribution, that we can reject the hypothesis that the - sample came from the normal distribution at the 10% level: - - >>> np.random.seed(987654321) - >>> stats.kstest(stats.t.rvs(3,size=100),'norm') - (0.131016895759829, 0.058826222555312224) - - """ - if isinstance(rvs, string_types): - if (not cdf) or (cdf == rvs): - cdf = getattr(distributions, rvs).cdf - rvs = getattr(distributions, rvs).rvs - else: - raise AttributeError("if rvs is string, cdf has to be the " - "same distribution") - - if isinstance(cdf, string_types): - cdf = getattr(distributions, cdf).cdf - if callable(rvs): - kwds = {'size':N} - vals = np.sort(rvs(*args,**kwds)) - else: - vals = np.sort(rvs) - N = len(vals) - cdfvals = cdf(vals, *args) - - # to not break compatibility with existing code - if alternative == 'two_sided': - alternative = 'two-sided' - - if alternative in ['two-sided', 'greater']: - Dplus = (np.arange(1.0, N+1)/N - cdfvals).max() - if alternative == 'greater': - return Dplus, distributions.ksone.sf(Dplus,N) - - if alternative in ['two-sided', 'less']: - Dmin = (cdfvals - np.arange(0.0, N)/N).max() - if alternative == 'less': - return Dmin, distributions.ksone.sf(Dmin,N) - - if alternative == 'two-sided': - D = np.max([Dplus,Dmin]) - if mode == 'asymp': - return D, distributions.kstwobign.sf(D*np.sqrt(N)) - if mode == 'approx': - pval_two = distributions.kstwobign.sf(D*np.sqrt(N)) - if N > 2666 or pval_two > 0.80 - N*0.3/1000.0: - return D, distributions.kstwobign.sf(D*np.sqrt(N)) - else: - return D, distributions.ksone.sf(D,N)*2 - - -# Map from names to lambda_ values used in power_divergence(). -_power_div_lambda_names = { - "pearson": 1, - "log-likelihood": 0, - "freeman-tukey": -0.5, - "mod-log-likelihood": -1, - "neyman": -2, - "cressie-read": 2/3, -} - - -def _count(a, axis=None): - """ - Count the number of non-masked elements of an array. 
- - This function behaves like np.ma.count(), but is much faster - for ndarrays. - """ - if hasattr(a, 'count'): - num = a.count(axis=axis) - if isinstance(num, np.ndarray) and num.ndim == 0: - # In some cases, the `count` method returns a scalar array (e.g. - # np.array(3)), but we want a plain integer. - num = int(num) - else: - if axis is None: - num = a.size - else: - num = a.shape[axis] - return num - - -def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None): - """ - Cressie-Read power divergence statistic and goodness of fit test. - - This function tests the null hypothesis that the categorical data - has the given frequencies, using the Cressie-Read power divergence - statistic. - - Parameters - ---------- - f_obs : array_like - Observed frequencies in each category. - f_exp : array_like, optional - Expected frequencies in each category. By default the categories are - assumed to be equally likely. - ddof : int, optional - "Delta degrees of freedom": adjustment to the degrees of freedom - for the p-value. The p-value is computed using a chi-squared - distribution with ``k - 1 - ddof`` degrees of freedom, where `k` - is the number of observed frequencies. The default value of `ddof` - is 0. - axis : int or None, optional - The axis of the broadcast result of `f_obs` and `f_exp` along which to - apply the test. If axis is None, all values in `f_obs` are treated - as a single data set. Default is 0. - lambda_ : float or str, optional - `lambda_` gives the power in the Cressie-Read power divergence - statistic. The default is 1. For convenience, `lambda_` may be - assigned one of the following strings, in which case the - corresponding numerical value is used:: - - String Value Description - "pearson" 1 Pearson's chi-squared statistic. - In this case, the function is - equivalent to `stats.chisquare`. - "log-likelihood" 0 Log-likelihood ratio. Also known as - the G-test [3]_. - "freeman-tukey" -1/2 Freeman-Tukey statistic. - "mod-log-likelihood" -1 Modified log-likelihood ratio. - "neyman" -2 Neyman's statistic. - "cressie-read" 2/3 The power recommended in [5]_. - - Returns - ------- - stat : float or ndarray - The Cressie-Read power divergence test statistic. The value is - a float if `axis` is None or if` `f_obs` and `f_exp` are 1-D. - p : float or ndarray - The p-value of the test. The value is a float if `ddof` and the - return value `stat` are scalars. - - See Also - -------- - chisquare - - Notes - ----- - This test is invalid when the observed or expected frequencies in each - category are too small. A typical rule is that all of the observed - and expected frequencies should be at least 5. - - When `lambda_` is less than zero, the formula for the statistic involves - dividing by `f_obs`, so a warning or error may be generated if any value - in `f_obs` is 0. - - Similarly, a warning or error may be generated if any value in `f_exp` is - zero when `lambda_` >= 0. - - The default degrees of freedom, k-1, are for the case when no parameters - of the distribution are estimated. If p parameters are estimated by - efficient maximum likelihood then the correct degrees of freedom are - k-1-p. If the parameters are estimated in a different way, then the - dof can be between k-1-p and k-1. However, it is also possible that - the asymptotic distribution is not a chisquare, in which case this - test is not appropriate. - - This function handles masked arrays. 
If an element of `f_obs` or `f_exp` - is masked, then data at that position is ignored, and does not count - towards the size of the data set. - - .. versionadded:: 0.13.0 - - References - ---------- - .. [1] Lowry, Richard. "Concepts and Applications of Inferential - Statistics". Chapter 8. http://faculty.vassar.edu/lowry/ch8pt1.html - .. [2] "Chi-squared test", http://en.wikipedia.org/wiki/Chi-squared_test - .. [3] "G-test", http://en.wikipedia.org/wiki/G-test - .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and - practice of statistics in biological research", New York: Freeman - (1981) - .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit - Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984), - pp. 440-464. - - Examples - -------- - - (See `chisquare` for more examples.) - - When just `f_obs` is given, it is assumed that the expected frequencies - are uniform and given by the mean of the observed frequencies. Here we - perform a G-test (i.e. use the log-likelihood ratio statistic): - - >>> power_divergence([16, 18, 16, 14, 12, 12], lambda_='log-likelihood') - (2.006573162632538, 0.84823476779463769) - - The expected frequencies can be given with the `f_exp` argument: - - >>> power_divergence([16, 18, 16, 14, 12, 12], - ... f_exp=[16, 16, 16, 16, 16, 8], - ... lambda_='log-likelihood') - (3.5, 0.62338762774958223) - - When `f_obs` is 2-D, by default the test is applied to each column. - - >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T - >>> obs.shape - (6, 2) - >>> power_divergence(obs, lambda_="log-likelihood") - (array([ 2.00657316, 6.77634498]), array([ 0.84823477, 0.23781225])) - - By setting ``axis=None``, the test is applied to all data in the array, - which is equivalent to applying the test to the flattened array. - - >>> power_divergence(obs, axis=None) - (23.31034482758621, 0.015975692534127565) - >>> power_divergence(obs.ravel()) - (23.31034482758621, 0.015975692534127565) - - `ddof` is the change to make to the default degrees of freedom. - - >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1) - (2.0, 0.73575888234288467) - - The calculation of the p-values is done by broadcasting the - test statistic with `ddof`. - - >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=[0,1,2]) - (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ])) - - `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has - shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting - `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared - statistics, we must use ``axis=1``: - - >>> power_divergence([16, 18, 16, 14, 12, 12], - ... f_exp=[[16, 16, 16, 16, 16, 8], - ... [8, 20, 20, 16, 12, 12]], - ... axis=1) - (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) - - """ - # Convert the input argument `lambda_` to a numerical value. - if isinstance(lambda_, string_types): - if lambda_ not in _power_div_lambda_names: - names = repr(list(_power_div_lambda_names.keys()))[1:-1] - raise ValueError("invalid string for lambda_: {0!r}. Valid strings " - "are {1}".format(lambda_, names)) - lambda_ = _power_div_lambda_names[lambda_] - elif lambda_ is None: - lambda_ = 1 - - f_obs = np.asanyarray(f_obs) - - if f_exp is not None: - f_exp = np.atleast_1d(np.asanyarray(f_exp)) - else: - # Compute the equivalent of - # f_exp = f_obs.mean(axis=axis, keepdims=True) - # Older versions of numpy do not have the 'keepdims' argument, so - # we have to do a little work to achieve the same result. 
- # Ignore 'invalid' errors so the edge case of a data set with length 0 - # is handled without spurious warnings. - with np.errstate(invalid='ignore'): - f_exp = np.atleast_1d(f_obs.mean(axis=axis)) - if axis is not None: - reduced_shape = list(f_obs.shape) - reduced_shape[axis] = 1 - f_exp.shape = reduced_shape - - # `terms` is the array of terms that are summed along `axis` to create - # the test statistic. We use some specialized code for a few special - # cases of lambda_. - if lambda_ == 1: - # Pearson's chi-squared statistic - terms = (f_obs - f_exp)**2 / f_exp - elif lambda_ == 0: - # Log-likelihood ratio (i.e. G-test) - terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp) - elif lambda_ == -1: - # Modified log-likelihood ratio - terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs) - else: - # General Cressie-Read power divergence. - terms = f_obs * ((f_obs / f_exp)**lambda_ - 1) - terms /= 0.5 * lambda_ * (lambda_ + 1) - - stat = terms.sum(axis=axis) - - num_obs = _count(terms, axis=axis) - ddof = asarray(ddof) - p = chisqprob(stat, num_obs - 1 - ddof) - - return stat, p - - -def chisquare(f_obs, f_exp=None, ddof=0, axis=0): - """ - Calculates a one-way chi square test. - - The chi square test tests the null hypothesis that the categorical data - has the given frequencies. - - Parameters - ---------- - f_obs : array_like - Observed frequencies in each category. - f_exp : array_like, optional - Expected frequencies in each category. By default the categories are - assumed to be equally likely. - ddof : int, optional - "Delta degrees of freedom": adjustment to the degrees of freedom - for the p-value. The p-value is computed using a chi-squared - distribution with ``k - 1 - ddof`` degrees of freedom, where `k` - is the number of observed frequencies. The default value of `ddof` - is 0. - axis : int or None, optional - The axis of the broadcast result of `f_obs` and `f_exp` along which to - apply the test. If axis is None, all values in `f_obs` are treated - as a single data set. Default is 0. - - Returns - ------- - chisq : float or ndarray - The chi-squared test statistic. The value is a float if `axis` is - None or `f_obs` and `f_exp` are 1-D. - p : float or ndarray - The p-value of the test. The value is a float if `ddof` and the - return value `chisq` are scalars. - - See Also - -------- - power_divergence - mstats.chisquare - - Notes - ----- - This test is invalid when the observed or expected frequencies in each - category are too small. A typical rule is that all of the observed - and expected frequencies should be at least 5. - - The default degrees of freedom, k-1, are for the case when no parameters - of the distribution are estimated. If p parameters are estimated by - efficient maximum likelihood then the correct degrees of freedom are - k-1-p. If the parameters are estimated in a different way, then the - dof can be between k-1-p and k-1. However, it is also possible that - the asymptotic distribution is not a chisquare, in which case this - test is not appropriate. - - References - ---------- - .. [1] Lowry, Richard. "Concepts and Applications of Inferential - Statistics". Chapter 8. http://faculty.vassar.edu/lowry/ch8pt1.html - .. [2] "Chi-squared test", http://en.wikipedia.org/wiki/Chi-squared_test - - Examples - -------- - When just `f_obs` is given, it is assumed that the expected frequencies - are uniform and given by the mean of the observed frequencies. 
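Concretely, with uniform expected frequencies the statistic reduces to sum((obs - exp)**2 / exp). A quick cross-check against `chisquare`, using the same counts as the examples below (Python 3 syntax, illustrative only):

    import numpy as np
    from scipy import stats

    f_obs = np.array([16, 18, 16, 14, 12, 12], dtype=float)
    f_exp = np.full_like(f_obs, f_obs.mean())   # uniform expected counts

    by_hand = ((f_obs - f_exp) ** 2 / f_exp).sum()
    chi2, p = stats.chisquare(f_obs)
    assert np.isclose(by_hand, chi2)            # both give 2.0 here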
- - >>> chisquare([16, 18, 16, 14, 12, 12]) - (2.0, 0.84914503608460956) - - With `f_exp` the expected frequencies can be given. - - >>> chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8]) - (3.5, 0.62338762774958223) - - When `f_obs` is 2-D, by default the test is applied to each column. - - >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T - >>> obs.shape - (6, 2) - >>> chisquare(obs) - (array([ 2. , 6.66666667]), array([ 0.84914504, 0.24663415])) - - By setting ``axis=None``, the test is applied to all data in the array, - which is equivalent to applying the test to the flattened array. - - >>> chisquare(obs, axis=None) - (23.31034482758621, 0.015975692534127565) - >>> chisquare(obs.ravel()) - (23.31034482758621, 0.015975692534127565) - - `ddof` is the change to make to the default degrees of freedom. - - >>> chisquare([16, 18, 16, 14, 12, 12], ddof=1) - (2.0, 0.73575888234288467) - - The calculation of the p-values is done by broadcasting the - chi-squared statistic with `ddof`. - - >>> chisquare([16, 18, 16, 14, 12, 12], ddof=[0,1,2]) - (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ])) - - `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has - shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting - `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared - statistics, we use ``axis=1``: - - >>> chisquare([16, 18, 16, 14, 12, 12], - ... f_exp=[[16, 16, 16, 16, 16, 8], [8, 20, 20, 16, 12, 12]], - ... axis=1) - (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) - - """ - return power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis, - lambda_="pearson") - - -def ks_2samp(data1, data2): - """ - Computes the Kolmogorov-Smirnov statistic on 2 samples. - - This is a two-sided test for the null hypothesis that 2 independent samples - are drawn from the same continuous distribution. - - Parameters - ---------- - a, b : sequence of 1-D ndarrays - two arrays of sample observations assumed to be drawn from a continuous - distribution, sample sizes can be different - - Returns - ------- - D : float - KS statistic - p-value : float - two-tailed p-value - - Notes - ----- - This tests whether 2 samples are drawn from the same distribution. Note - that, like in the case of the one-sample K-S test, the distribution is - assumed to be continuous. - - This is the two-sided test, one-sided tests are not implemented. - The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution. - - If the K-S statistic is small or the p-value is high, then we cannot - reject the hypothesis that the distributions of the two samples - are the same. 
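The statistic is the largest absolute gap between the two empirical CDFs, evaluated over the pooled sample. A condensed sketch of the computation that follows below (`ks_2samp_stat` is an illustrative name):

    import numpy as np

    def ks_2samp_stat(data1, data2):
        # max |ECDF1 - ECDF2| over the pooled, sorted observations
        data1, data2 = np.sort(data1), np.sort(data2)
        pooled = np.concatenate([data1, data2])
        cdf1 = np.searchsorted(data1, pooled, side='right') / float(len(data1))
        cdf2 = np.searchsorted(data2, pooled, side='right') / float(len(data2))
        return np.abs(cdf1 - cdf2).max()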
- - Examples - -------- - >>> from scipy import stats - >>> np.random.seed(12345678) #fix random seed to get the same result - >>> n1 = 200 # size of first sample - >>> n2 = 300 # size of second sample - - For a different distribution, we can reject the null hypothesis since the - pvalue is below 1%: - - >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1) - >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5) - >>> stats.ks_2samp(rvs1, rvs2) - (0.20833333333333337, 4.6674975515806989e-005) - - For a slightly different distribution, we cannot reject the null hypothesis - at a 10% or lower alpha since the p-value at 0.144 is higher than 10% - - >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0) - >>> stats.ks_2samp(rvs1, rvs3) - (0.10333333333333333, 0.14498781825751686) - - For an identical distribution, we cannot reject the null hypothesis since - the p-value is high, 41%: - - >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0) - >>> stats.ks_2samp(rvs1, rvs4) - (0.07999999999999996, 0.41126949729859719) - - """ - data1, data2 = map(asarray, (data1, data2)) - n1 = data1.shape[0] - n2 = data2.shape[0] - n1 = len(data1) - n2 = len(data2) - data1 = np.sort(data1) - data2 = np.sort(data2) - data_all = np.concatenate([data1,data2]) - cdf1 = np.searchsorted(data1,data_all,side='right')/(1.0*n1) - cdf2 = (np.searchsorted(data2,data_all,side='right'))/(1.0*n2) - d = np.max(np.absolute(cdf1-cdf2)) - # Note: d absolute not signed distance - en = np.sqrt(n1*n2/float(n1+n2)) - try: - prob = distributions.kstwobign.sf((en + 0.12 + 0.11 / en) * d) - except: - prob = 1.0 - return d, prob - - -def mannwhitneyu(x, y, use_continuity=True): - """ - Computes the Mann-Whitney rank test on samples x and y. - - Parameters - ---------- - x, y : array_like - Array of samples, should be one-dimensional. - use_continuity : bool, optional - Whether a continuity correction (1/2.) should be taken into - account. Default is True. - - Returns - ------- - u : float - The Mann-Whitney statistics. - prob : float - One-sided p-value assuming a asymptotic normal distribution. - - Notes - ----- - Use only when the number of observation in each sample is > 20 and - you have 2 independent samples of ranks. Mann-Whitney U is - significant if the u-obtained is LESS THAN or equal to the critical - value of U. - - This test corrects for ties and by default uses a continuity correction. - The reported p-value is for a one-sided hypothesis, to get the two-sided - p-value multiply the returned p-value by 2. - - """ - x = asarray(x) - y = asarray(y) - n1 = len(x) - n2 = len(y) - ranked = rankdata(np.concatenate((x,y))) - rankx = ranked[0:n1] # get the x-ranks - u1 = n1*n2 + (n1*(n1+1))/2.0 - np.sum(rankx,axis=0) # calc U for x - u2 = n1*n2 - u1 # remainder is U for y - bigu = max(u1,u2) - smallu = min(u1,u2) - T = tiecorrect(ranked) - if T == 0: - raise ValueError('All numbers are identical in amannwhitneyu') - sd = np.sqrt(T*n1*n2*(n1+n2+1)/12.0) - - if use_continuity: - # normal approximation for prob calc with continuity correction - z = abs((bigu-0.5-n1*n2/2.0) / sd) - else: - z = abs((bigu-n1*n2/2.0) / sd) # normal approximation for prob calc - return smallu, distributions.norm.sf(z) # (1.0 - zprob(z)) - - -def ranksums(x, y): - """ - Compute the Wilcoxon rank-sum statistic for two samples. - - The Wilcoxon rank-sum test tests the null hypothesis that two sets - of measurements are drawn from the same distribution. 
The alternative - hypothesis is that values in one sample are more likely to be - larger than the values in the other sample. - - This test should be used to compare two samples from continuous - distributions. It does not handle ties between measurements - in x and y. For tie-handling and an optional continuity correction - see `scipy.stats.mannwhitneyu`. - - Parameters - ---------- - x,y : array_like - The data from the two samples - - Returns - ------- - z-statistic : float - The test statistic under the large-sample approximation that the - rank sum statistic is normally distributed - p-value : float - The two-sided p-value of the test - - References - ---------- - .. [1] http://en.wikipedia.org/wiki/Wilcoxon_rank-sum_test - - """ - x,y = map(np.asarray, (x, y)) - n1 = len(x) - n2 = len(y) - alldata = np.concatenate((x,y)) - ranked = rankdata(alldata) - x = ranked[:n1] - y = ranked[n1:] - s = np.sum(x,axis=0) - expected = n1*(n1+n2+1) / 2.0 - z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0) - prob = 2 * distributions.norm.sf(abs(z)) - return z, prob - - -def kruskal(*args): - """ - Compute the Kruskal-Wallis H-test for independent samples - - The Kruskal-Wallis H-test tests the null hypothesis that the population - median of all of the groups are equal. It is a non-parametric version of - ANOVA. The test works on 2 or more independent samples, which may have - different sizes. Note that rejecting the null hypothesis does not - indicate which of the groups differs. Post-hoc comparisons between - groups are required to determine which groups are different. - - Parameters - ---------- - sample1, sample2, ... : array_like - Two or more arrays with the sample measurements can be given as - arguments. - - Returns - ------- - H-statistic : float - The Kruskal-Wallis H statistic, corrected for ties - p-value : float - The p-value for the test using the assumption that H has a chi - square distribution - - Notes - ----- - Due to the assumption that H has a chi square distribution, the number - of samples in each group must not be too small. A typical rule is - that each sample must have at least 5 measurements. - - References - ---------- - .. [1] http://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance - - """ - args = list(map(np.asarray, args)) # convert to a numpy array - na = len(args) # Kruskal-Wallis on 'na' groups, each in it's own array - if na < 2: - raise ValueError("Need at least two groups in stats.kruskal()") - n = np.asarray(list(map(len, args))) - - alldata = np.concatenate(args) - - ranked = rankdata(alldata) # Rank the data - T = tiecorrect(ranked) # Correct for ties - if T == 0: - raise ValueError('All numbers are identical in kruskal') - - # Compute sum^2/n for each group and sum - j = np.insert(np.cumsum(n), 0, 0) - ssbn = 0 - for i in range(na): - ssbn += square_of_sums(ranked[j[i]:j[i+1]]) / float(n[i]) - - totaln = np.sum(n) - h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1) - df = na - 1 - h = h / float(T) - return h, chisqprob(h, df) - - -def friedmanchisquare(*args): - """ - Computes the Friedman test for repeated measurements - - The Friedman test tests the null hypothesis that repeated measurements of - the same individuals have the same distribution. It is often used - to test for consistency among measurements obtained in different ways. - For example, if two measurement techniques are used on the same set of - individuals, the Friedman test can be used to determine if the two - measurement techniques are consistent. 
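For reference, the H statistic assembled in `kruskal` above is H = 12/(N(N+1)) * sum_i R_i**2/n_i - 3(N+1), where R_i is the rank sum of group i and N the total sample size. A compact sketch without the tie correction (`kruskal_h` is an illustrative name):

    import numpy as np
    from scipy.stats import rankdata

    def kruskal_h(*groups):
        # H = 12/(N(N+1)) * sum(R_i**2 / n_i) - 3(N+1); ties ignored here
        n = np.array([len(g) for g in groups])
        ranks = rankdata(np.concatenate(groups))
        edges = np.insert(np.cumsum(n), 0, 0)
        ssbn = sum(ranks[edges[i]:edges[i + 1]].sum() ** 2 / float(n[i])
                   for i in range(len(groups)))
        big_n = n.sum()
        return 12.0 / (big_n * (big_n + 1)) * ssbn - 3 * (big_n + 1)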
- - Parameters - ---------- - measurements1, measurements2, measurements3... : array_like - Arrays of measurements. All of the arrays must have the same number - of elements. At least 3 sets of measurements must be given. - - Returns - ------- - friedman chi-square statistic : float - the test statistic, correcting for ties - p-value : float - the associated p-value assuming that the test statistic has a chi - squared distribution - - Notes - ----- - Due to the assumption that the test statistic has a chi squared - distribution, the p-value is only reliable for n > 10 and more than - 6 repeated measurements. - - References - ---------- - .. [1] http://en.wikipedia.org/wiki/Friedman_test - - """ - k = len(args) - if k < 3: - raise ValueError('\nLess than 3 levels. Friedman test not appropriate.\n') - - n = len(args[0]) - for i in range(1, k): - if len(args[i]) != n: - raise ValueError('Unequal N in friedmanchisquare. Aborting.') - - # Rank data - data = np.vstack(args).T - data = data.astype(float) - for i in range(len(data)): - data[i] = rankdata(data[i]) - - # Handle ties - ties = 0 - for i in range(len(data)): - replist, repnum = find_repeats(array(data[i])) - for t in repnum: - ties += t*(t*t-1) - c = 1 - ties / float(k*(k*k-1)*n) - - ssbn = pysum(pysum(data)**2) - chisq = (12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)) / c - return chisq, chisqprob(chisq,k-1) - - -##################################### -#### PROBABILITY CALCULATIONS #### -##################################### - -zprob = np.deprecate(message='zprob is deprecated in scipy 0.14, ' - 'use norm.cdf or special.ndtr instead\n', - old_name='zprob')(special.ndtr) - - -def chisqprob(chisq, df): - """ - Probability value (1-tail) for the Chi^2 probability distribution. - - Broadcasting rules apply. - - Parameters - ---------- - chisq : array_like or float > 0 - - df : array_like or float, probably int >= 1 - - Returns - ------- - chisqprob : ndarray - The area from `chisq` to infinity under the Chi^2 probability - distribution with degrees of freedom `df`. - - """ - return special.chdtrc(df,chisq) - -ksprob = np.deprecate(message='ksprob is deprecated in scipy 0.14, ' - 'use stats.kstwobign.sf or special.kolmogorov instead\n', - old_name='ksprob')(special.kolmogorov) - -fprob = np.deprecate(message='fprob is deprecated in scipy 0.14, ' - 'use stats.f.sf or special.fdtrc instead\n', - old_name='fprob')(special.fdtrc) - - -def betai(a, b, x): - """ - Returns the incomplete beta function. - - I_x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt) - - where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma - function of a. - - The standard broadcasting rules apply to a, b, and x. - - Parameters - ---------- - a : array_like or float > 0 - - b : array_like or float > 0 - - x : array_like or float - x will be clipped to be no greater than 1.0 . - - Returns - ------- - betai : ndarray - Incomplete beta function. - - """ - x = np.asarray(x) - x = np.where(x < 1.0, x, 1.0) # if x > 1 then return 1.0 - return special.betainc(a, b, x) - - -##################################### -####### ANOVA CALCULATIONS ####### -##################################### - -def f_value_wilks_lambda(ER, EF, dfnum, dfden, a, b): - """Calculation of Wilks lambda F-statistic for multivarite data, per - Maxwell & Delaney p.657. 
- """ - if isinstance(ER, (int, float)): - ER = array([[ER]]) - if isinstance(EF, (int, float)): - EF = array([[EF]]) - lmbda = linalg.det(EF) / linalg.det(ER) - if (a-1)**2 + (b-1)**2 == 5: - q = 1 - else: - q = np.sqrt(((a-1)**2*(b-1)**2 - 2) / ((a-1)**2 + (b-1)**2 - 5)) - n_um = (1 - lmbda**(1.0/q))*(a-1)*(b-1) - d_en = lmbda**(1.0/q) / (n_um*q - 0.5*(a-1)*(b-1) + 1) - return n_um / d_en - - -def f_value(ER, EF, dfR, dfF): - """ - Returns an F-statistic for a restricted vs. unrestricted model. - - Parameters - ---------- - ER : float - `ER` is the sum of squared residuals for the restricted model - or null hypothesis - - EF : float - `EF` is the sum of squared residuals for the unrestricted model - or alternate hypothesis - - dfR : int - `dfR` is the degrees of freedom in the restricted model - - dfF : int - `dfF` is the degrees of freedom in the unrestricted model - - Returns - ------- - F-statistic : float - - """ - return ((ER-EF)/float(dfR-dfF) / (EF/float(dfF))) - - -def f_value_multivariate(ER, EF, dfnum, dfden): - """ - Returns a multivariate F-statistic. - - Parameters - ---------- - ER : ndarray - Error associated with the null hypothesis (the Restricted model). - From a multivariate F calculation. - EF : ndarray - Error associated with the alternate hypothesis (the Full model) - From a multivariate F calculation. - dfnum : int - Degrees of freedom the Restricted model. - dfden : int - Degrees of freedom associated with the Restricted model. - - Returns - ------- - fstat : float - The computed F-statistic. - - """ - if isinstance(ER, (int, float)): - ER = array([[ER]]) - if isinstance(EF, (int, float)): - EF = array([[EF]]) - n_um = (linalg.det(ER) - linalg.det(EF)) / float(dfnum) - d_en = linalg.det(EF) / float(dfden) - return n_um / d_en - - -##################################### -####### SUPPORT FUNCTIONS ######## -##################################### - -def ss(a, axis=0): - """ - Squares each element of the input array, and returns the sum(s) of that. - - Parameters - ---------- - a : array_like - Input array. - axis : int or None, optional - The axis along which to calculate. If None, use whole array. - Default is 0, i.e. along the first axis. - - Returns - ------- - ss : ndarray - The sum along the given axis for (a**2). - - See also - -------- - square_of_sums : The square(s) of the sum(s) (the opposite of `ss`). - - Examples - -------- - >>> from scipy import stats - >>> a = np.array([1., 2., 5.]) - >>> stats.ss(a) - 30.0 - - And calculating along an axis: - - >>> b = np.array([[1., 2., 5.], [2., 5., 6.]]) - >>> stats.ss(b, axis=1) - array([ 30., 65.]) - - """ - a, axis = _chk_asarray(a, axis) - return np.sum(a*a, axis) - - -def square_of_sums(a, axis=0): - """ - Sums elements of the input array, and returns the square(s) of that sum. - - Parameters - ---------- - a : array_like - Input array. - axis : int or None, optional - If axis is None, ravel `a` first. If `axis` is an integer, this will - be the axis over which to operate. Defaults to 0. - - Returns - ------- - square_of_sums : float or ndarray - The square of the sum over `axis`. - - See also - -------- - ss : The sum of squares (the opposite of `square_of_sums`). 
- - Examples - -------- - >>> from scipy import stats - >>> a = np.arange(20).reshape(5,4) - >>> stats.square_of_sums(a) - array([ 1600., 2025., 2500., 3025.]) - >>> stats.square_of_sums(a, axis=None) - 36100.0 - - """ - a, axis = _chk_asarray(a, axis) - s = np.sum(a,axis) - if not np.isscalar(s): - return s.astype(float)*s - else: - return float(s)*s - - -def fastsort(a): - """ - Sort an array and provide the argsort. - - Parameters - ---------- - a : array_like - Input array. - - Returns - ------- - fastsort : ndarray of type int - sorted indices into the original array - - """ - # TODO: the wording in the docstring is nonsense. - it = np.argsort(a) - as_ = a[it] - return as_, it diff --git a/wafo/stats/tests/common_tests.py b/wafo/stats/tests/common_tests.py index 3ddf534..c4c0129 100644 --- a/wafo/stats/tests/common_tests.py +++ b/wafo/stats/tests/common_tests.py @@ -1,16 +1,23 @@ from __future__ import division, print_function, absolute_import -import inspect import warnings +import pickle import numpy as np import numpy.testing as npt +from numpy.testing import assert_allclose +import numpy.ma.testutils as ma_npt -from scipy._lib._version import NumpyVersion +from wafo.stats._util import getargspec_no_self as _getargspec from wafo import stats -NUMPY_BELOW_1_7 = NumpyVersion(np.__version__) < '1.7.0' +def check_named_results(res, attributes, ma=False): + for i, attr in enumerate(attributes): + if ma: + ma_npt.assert_equal(res[i], getattr(res, attr)) + else: + npt.assert_equal(res[i], getattr(res, attr)) def check_normalization(distfn, args, distname): @@ -94,17 +101,19 @@ def check_private_entropy(distfn, args, superclass): def check_edge_support(distfn, args): - # Make sure the x=self.a and self.b are handled correctly. + # Make sure that x=self.a and self.b are handled correctly. x = [distfn.a, distfn.b] - if isinstance(distfn, stats.rv_continuous): - npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0]) - npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0]) + if isinstance(distfn, stats.rv_discrete): + x = [distfn.a - 1, distfn.b] + + npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0]) + npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0]) - npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0]) + if distfn.name not in ('skellam', 'dlaplace'): + # with a = -inf, log(0) generates warnings + npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0]) npt.assert_equal(distfn.logsf(x, *args), [0.0, -np.inf]) - if isinstance(distfn, stats.rv_discrete): - x = [distfn.a-1, distfn.b] npt.assert_equal(distfn.ppf([0.0, 1.0], *args), x) npt.assert_equal(distfn.isf([0.0, 1.0], *args), x[::-1]) @@ -117,12 +126,12 @@ def check_named_args(distfn, x, shape_args, defaults, meths): ## Check calling w/ named arguments. 
# check consistency of shapes, numargs and _parse signature - signature = inspect.getargspec(distfn._parse_args) + signature = _getargspec(distfn._parse_args) npt.assert_(signature.varargs is None) npt.assert_(signature.keywords is None) - npt.assert_(signature.defaults == defaults) + npt.assert_(list(signature.defaults) == list(defaults)) - shape_argnames = signature.args[1:-len(defaults)] # self, a, b, loc=0, scale=1 + shape_argnames = signature.args[:-len(defaults)] # a, b, loc=0, scale=1 if distfn.shapes: shapes_ = distfn.shapes.replace(',', ' ').split() else: @@ -152,3 +161,115 @@ def check_named_args(distfn, x, shape_args, defaults, meths): k.update({'kaboom': 42}) npt.assert_raises(TypeError, distfn.cdf, x, **k) + +def check_random_state_property(distfn, args): + # check the random_state attribute of a distribution *instance* + + # This test fiddles with distfn.random_state. This breaks other tests, + # hence need to save it and then restore. + rndm = distfn.random_state + + # baseline: this relies on the global state + np.random.seed(1234) + distfn.random_state = None + r0 = distfn.rvs(*args, size=8) + + # use an explicit instance-level random_state + distfn.random_state = 1234 + r1 = distfn.rvs(*args, size=8) + npt.assert_equal(r0, r1) + + distfn.random_state = np.random.RandomState(1234) + r2 = distfn.rvs(*args, size=8) + npt.assert_equal(r0, r2) + + # can override the instance-level random_state for an individual .rvs call + distfn.random_state = 2 + orig_state = distfn.random_state.get_state() + + r3 = distfn.rvs(*args, size=8, random_state=np.random.RandomState(1234)) + npt.assert_equal(r0, r3) + + # ... and that does not alter the instance-level random_state! + npt.assert_equal(distfn.random_state.get_state(), orig_state) + + # finally, restore the random_state + distfn.random_state = rndm + + +def check_meth_dtype(distfn, arg, meths): + q0 = [0.25, 0.5, 0.75] + x0 = distfn.ppf(q0, *arg) + x_cast = [x0.astype(tp) for tp in + (np.int_, np.float16, np.float32, np.float64)] + + for x in x_cast: + # casting may have clipped the values, exclude those + distfn._argcheck(*arg) + x = x[(distfn.a < x) & (x < distfn.b)] + for meth in meths: + val = meth(x, *arg) + npt.assert_(val.dtype == np.float_) + + +def check_ppf_dtype(distfn, arg): + q0 = np.asarray([0.25, 0.5, 0.75]) + q_cast = [q0.astype(tp) for tp in (np.float16, np.float32, np.float64)] + for q in q_cast: + for meth in [distfn.ppf, distfn.isf]: + val = meth(q, *arg) + npt.assert_(val.dtype == np.float_) + + +def check_cmplx_deriv(distfn, arg): + # Distributions allow complex arguments. 
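+    # The helper below uses complex-step differentiation: for analytic f
+    # and small h, f(x + 1j*h) ~= f(x) + 1j*h*f'(x), so the imaginary part
+    # divided by h approximates f'(x) without subtractive cancellation.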
+ def deriv(f, x, *arg): + x = np.asarray(x) + h = 1e-10 + return (f(x + h*1j, *arg)/h).imag + + x0 = distfn.ppf([0.25, 0.51, 0.75], *arg) + x_cast = [x0.astype(tp) for tp in + (np.int_, np.float16, np.float32, np.float64)] + + for x in x_cast: + # casting may have clipped the values, exclude those + distfn._argcheck(*arg) + x = x[(distfn.a < x) & (x < distfn.b)] + + pdf, cdf, sf = distfn.pdf(x, *arg), distfn.cdf(x, *arg), distfn.sf(x, *arg) + assert_allclose(deriv(distfn.cdf, x, *arg), pdf, rtol=1e-5) + assert_allclose(deriv(distfn.logcdf, x, *arg), pdf/cdf, rtol=1e-5) + + assert_allclose(deriv(distfn.sf, x, *arg), -pdf, rtol=1e-5) + assert_allclose(deriv(distfn.logsf, x, *arg), -pdf/sf, rtol=1e-5) + + assert_allclose(deriv(distfn.logpdf, x, *arg), + deriv(distfn.pdf, x, *arg) / distfn.pdf(x, *arg), + rtol=1e-5) + + +def check_pickling(distfn, args): + # check that a distribution instance pickles and unpickles + # pay special attention to the random_state property + + # save the random_state (restore later) + rndm = distfn.random_state + + distfn.random_state = 1234 + distfn.rvs(*args, size=8) + s = pickle.dumps(distfn) + r0 = distfn.rvs(*args, size=8) + + unpickled = pickle.loads(s) + r1 = unpickled.rvs(*args, size=8) + npt.assert_equal(r0, r1) + + # also smoke test some methods + medians = [distfn.ppf(0.5, *args), unpickled.ppf(0.5, *args)] + npt.assert_equal(medians[0], medians[1]) + npt.assert_equal(distfn.cdf(medians[0], *args), + unpickled.cdf(medians[1], *args)) + + # restore the random_state + distfn.random_state = rndm diff --git a/wafo/stats/tests/test_binned_statistic.py b/wafo/stats/tests/test_binned_statistic.py deleted file mode 100644 index 26cc4be..0000000 --- a/wafo/stats/tests/test_binned_statistic.py +++ /dev/null @@ -1,238 +0,0 @@ -from __future__ import division, print_function, absolute_import - -import numpy as np -from numpy.testing import assert_array_almost_equal, run_module_suite -from scipy.stats import \ - binned_statistic, binned_statistic_2d, binned_statistic_dd - - -class TestBinnedStatistic(object): - - @classmethod - def setup_class(cls): - np.random.seed(9865) - cls.x = np.random.random(100) - cls.y = np.random.random(100) - cls.v = np.random.random(100) - cls.X = np.random.random((100, 3)) - - def test_1d_count(self): - x = self.x - v = self.v - - count1, edges1, bc = binned_statistic(x, v, 'count', bins=10) - count2, edges2 = np.histogram(x, bins=10) - - assert_array_almost_equal(count1, count2) - assert_array_almost_equal(edges1, edges2) - - def test_1d_sum(self): - x = self.x - v = self.v - - sum1, edges1, bc = binned_statistic(x, v, 'sum', bins=10) - sum2, edges2 = np.histogram(x, bins=10, weights=v) - - assert_array_almost_equal(sum1, sum2) - assert_array_almost_equal(edges1, edges2) - - def test_1d_mean(self): - x = self.x - v = self.v - - stat1, edges1, bc = binned_statistic(x, v, 'mean', bins=10) - stat2, edges2, bc = binned_statistic(x, v, np.mean, bins=10) - - assert_array_almost_equal(stat1, stat2) - assert_array_almost_equal(edges1, edges2) - - def test_1d_std(self): - x = self.x - v = self.v - - stat1, edges1, bc = binned_statistic(x, v, 'std', bins=10) - stat2, edges2, bc = binned_statistic(x, v, np.std, bins=10) - - assert_array_almost_equal(stat1, stat2) - assert_array_almost_equal(edges1, edges2) - - def test_1d_median(self): - x = self.x - v = self.v - - stat1, edges1, bc = binned_statistic(x, v, 'median', bins=10) - stat2, edges2, bc = binned_statistic(x, v, np.median, bins=10) - - assert_array_almost_equal(stat1, stat2) - 
assert_array_almost_equal(edges1, edges2) - - def test_1d_bincode(self): - x = self.x[:20] - v = self.v[:20] - - count1, edges1, bc = binned_statistic(x, v, 'count', bins=3) - bc2 = np.array([3, 2, 1, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 1, 2, 3, 1, - 1, 2, 1]) - - bcount = [(bc == i).sum() for i in np.unique(bc)] - - assert_array_almost_equal(bc, bc2) - assert_array_almost_equal(bcount, count1) - - def test_1d_range_keyword(self): - # Regression test for gh-3063, range can be (min, max) or [(min, max)] - np.random.seed(9865) - x = np.arange(30) - data = np.random.random(30) - - mean, bins, _ = binned_statistic(x[:15], data[:15]) - mean_range, bins_range, _ = binned_statistic(x, data, range=[(0, 14)]) - mean_range2, bins_range2, _ = binned_statistic(x, data, range=(0, 14)) - - assert_array_almost_equal(mean, mean_range) - assert_array_almost_equal(bins, bins_range) - assert_array_almost_equal(mean, mean_range2) - assert_array_almost_equal(bins, bins_range2) - - def test_2d_count(self): - x = self.x - y = self.y - v = self.v - - count1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'count', bins=5) - count2, binx2, biny2 = np.histogram2d(x, y, bins=5) - - assert_array_almost_equal(count1, count2) - assert_array_almost_equal(binx1, binx2) - assert_array_almost_equal(biny1, biny2) - - def test_2d_sum(self): - x = self.x - y = self.y - v = self.v - - sum1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'sum', bins=5) - sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v) - - assert_array_almost_equal(sum1, sum2) - assert_array_almost_equal(binx1, binx2) - assert_array_almost_equal(biny1, biny2) - - def test_2d_mean(self): - x = self.x - y = self.y - v = self.v - - stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'mean', bins=5) - stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5) - - assert_array_almost_equal(stat1, stat2) - assert_array_almost_equal(binx1, binx2) - assert_array_almost_equal(biny1, biny2) - - def test_2d_std(self): - x = self.x - y = self.y - v = self.v - - stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'std', bins=5) - stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.std, bins=5) - - assert_array_almost_equal(stat1, stat2) - assert_array_almost_equal(binx1, binx2) - assert_array_almost_equal(biny1, biny2) - - def test_2d_median(self): - x = self.x - y = self.y - v = self.v - - stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'median', bins=5) - stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.median, bins=5) - - assert_array_almost_equal(stat1, stat2) - assert_array_almost_equal(binx1, binx2) - assert_array_almost_equal(biny1, biny2) - - def test_2d_bincode(self): - x = self.x[:20] - y = self.y[:20] - v = self.v[:20] - - count1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'count', bins=3) - bc2 = np.array([17, 11, 6, 16, 11, 17, 18, 17, 17, 7, 6, 18, 16, - 6, 11, 16, 6, 6, 11, 8]) - - bcount = [(bc == i).sum() for i in np.unique(bc)] - - assert_array_almost_equal(bc, bc2) - count1adj = count1[count1.nonzero()] - assert_array_almost_equal(bcount, count1adj) - - def test_dd_count(self): - X = self.X - v = self.v - - count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3) - count2, edges2 = np.histogramdd(X, bins=3) - - assert_array_almost_equal(count1, count2) - assert_array_almost_equal(edges1, edges2) - - def test_dd_sum(self): - X = self.X - v = self.v - - sum1, edges1, bc = binned_statistic_dd(X, v, 'sum', bins=3) - sum2, edges2 = np.histogramdd(X, bins=3, weights=v) - - 
assert_array_almost_equal(sum1, sum2) - assert_array_almost_equal(edges1, edges2) - - def test_dd_mean(self): - X = self.X - v = self.v - - stat1, edges1, bc = binned_statistic_dd(X, v, 'mean', bins=3) - stat2, edges2, bc = binned_statistic_dd(X, v, np.mean, bins=3) - - assert_array_almost_equal(stat1, stat2) - assert_array_almost_equal(edges1, edges2) - - def test_dd_std(self): - X = self.X - v = self.v - - stat1, edges1, bc = binned_statistic_dd(X, v, 'std', bins=3) - stat2, edges2, bc = binned_statistic_dd(X, v, np.std, bins=3) - - assert_array_almost_equal(stat1, stat2) - assert_array_almost_equal(edges1, edges2) - - def test_dd_median(self): - X = self.X - v = self.v - - stat1, edges1, bc = binned_statistic_dd(X, v, 'median', bins=3) - stat2, edges2, bc = binned_statistic_dd(X, v, np.median, bins=3) - - assert_array_almost_equal(stat1, stat2) - assert_array_almost_equal(edges1, edges2) - - def test_dd_bincode(self): - X = self.X[:20] - v = self.v[:20] - - count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3) - bc2 = np.array([63, 33, 86, 83, 88, 67, 57, 33, 42, 41, 82, 83, 92, - 32, 36, 91, 43, 87, 81, 81]) - - bcount = [(bc == i).sum() for i in np.unique(bc)] - - assert_array_almost_equal(bc, bc2) - count1adj = count1[count1.nonzero()] - assert_array_almost_equal(bcount, count1adj) - - -if __name__ == "__main__": - run_module_suite() diff --git a/wafo/stats/tests/test_contingency.py b/wafo/stats/tests/test_contingency.py deleted file mode 100644 index 23eee17..0000000 --- a/wafo/stats/tests/test_contingency.py +++ /dev/null @@ -1,202 +0,0 @@ -from __future__ import division, print_function, absolute_import - -import numpy as np -from numpy.testing import (run_module_suite, assert_equal, assert_array_equal, - assert_array_almost_equal, assert_approx_equal, assert_raises, - assert_allclose) -from scipy.special import xlogy -from scipy.stats.contingency import margins, expected_freq, chi2_contingency - - -def test_margins(): - a = np.array([1]) - m = margins(a) - assert_equal(len(m), 1) - m0 = m[0] - assert_array_equal(m0, np.array([1])) - - a = np.array([[1]]) - m0, m1 = margins(a) - expected0 = np.array([[1]]) - expected1 = np.array([[1]]) - assert_array_equal(m0, expected0) - assert_array_equal(m1, expected1) - - a = np.arange(12).reshape(2, 6) - m0, m1 = margins(a) - expected0 = np.array([[15], [51]]) - expected1 = np.array([[6, 8, 10, 12, 14, 16]]) - assert_array_equal(m0, expected0) - assert_array_equal(m1, expected1) - - a = np.arange(24).reshape(2, 3, 4) - m0, m1, m2 = margins(a) - expected0 = np.array([[[66]], [[210]]]) - expected1 = np.array([[[60], [92], [124]]]) - expected2 = np.array([[[60, 66, 72, 78]]]) - assert_array_equal(m0, expected0) - assert_array_equal(m1, expected1) - assert_array_equal(m2, expected2) - - -def test_expected_freq(): - assert_array_equal(expected_freq([1]), np.array([1.0])) - - observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]]) - e = expected_freq(observed) - assert_array_equal(e, np.ones_like(observed)) - - observed = np.array([[10, 10, 20], [20, 20, 20]]) - e = expected_freq(observed) - correct = np.array([[12., 12., 16.], [18., 18., 24.]]) - assert_array_almost_equal(e, correct) - - -def test_chi2_contingency_trivial(): - # Some very simple tests for chi2_contingency. 
- - # A trivial case - obs = np.array([[1, 2], [1, 2]]) - chi2, p, dof, expected = chi2_contingency(obs, correction=False) - assert_equal(chi2, 0.0) - assert_equal(p, 1.0) - assert_equal(dof, 1) - assert_array_equal(obs, expected) - - # A *really* trivial case: 1-D data. - obs = np.array([1, 2, 3]) - chi2, p, dof, expected = chi2_contingency(obs, correction=False) - assert_equal(chi2, 0.0) - assert_equal(p, 1.0) - assert_equal(dof, 0) - assert_array_equal(obs, expected) - - -def test_chi2_contingency_R(): - # Some test cases that were computed independently, using R. - - Rcode = \ - """ - # Data vector. - data <- c( - 12, 34, 23, 4, 47, 11, - 35, 31, 11, 34, 10, 18, - 12, 32, 9, 18, 13, 19, - 12, 12, 14, 9, 33, 25 - ) - - # Create factor tags:r=rows, c=columns, t=tiers - r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4"))) - c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3"))) - t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2"))) - - # 3-way Chi squared test of independence - s = summary(xtabs(data~r+c+t)) - print(s) - """ - Routput = \ - """ - Call: xtabs(formula = data ~ r + c + t) - Number of cases in table: 478 - Number of factors: 3 - Test for independence of all factors: - Chisq = 102.17, df = 17, p-value = 3.514e-14 - """ - obs = np.array( - [[[12, 34, 23], - [35, 31, 11], - [12, 32, 9], - [12, 12, 14]], - [[4, 47, 11], - [34, 10, 18], - [18, 13, 19], - [9, 33, 25]]]) - chi2, p, dof, expected = chi2_contingency(obs) - assert_approx_equal(chi2, 102.17, significant=5) - assert_approx_equal(p, 3.514e-14, significant=4) - assert_equal(dof, 17) - - Rcode = \ - """ - # Data vector. - data <- c( - # - 12, 17, - 11, 16, - # - 11, 12, - 15, 16, - # - 23, 15, - 30, 22, - # - 14, 17, - 15, 16 - ) - - # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers - r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2"))) - c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2"))) - d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2"))) - t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2"))) - - # 4-way Chi squared test of independence - s = summary(xtabs(data~r+c+d+t)) - print(s) - """ - Routput = \ - """ - Call: xtabs(formula = data ~ r + c + d + t) - Number of cases in table: 262 - Number of factors: 4 - Test for independence of all factors: - Chisq = 8.758, df = 11, p-value = 0.6442 - """ - obs = np.array( - [[[[12, 17], - [11, 16]], - [[11, 12], - [15, 16]]], - [[[23, 15], - [30, 22]], - [[14, 17], - [15, 16]]]]) - chi2, p, dof, expected = chi2_contingency(obs) - assert_approx_equal(chi2, 8.758, significant=4) - assert_approx_equal(p, 0.6442, significant=4) - assert_equal(dof, 11) - - -def test_chi2_contingency_g(): - c = np.array([[15, 60], [15, 90]]) - g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=False) - assert_allclose(g, 2*xlogy(c, c/e).sum()) - - g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=True) - c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]]) - assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum()) - - c = np.array([[10, 12, 10], [12, 10, 10]]) - g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood') - assert_allclose(g, 2*xlogy(c, c/e).sum()) - - -def test_chi2_contingency_bad_args(): - # Test that "bad" inputs raise a ValueError. - - # Negative value in the array of observed frequencies. - obs = np.array([[-1, 10], [1, 2]]) - assert_raises(ValueError, chi2_contingency, obs) - - # The zeros in this will result in zeros in the array - # of expected frequencies. 
- obs = np.array([[0, 1], [0, 1]]) - assert_raises(ValueError, chi2_contingency, obs) - - # A degenerate case: `observed` has size 0. - obs = np.empty((0, 8)) - assert_raises(ValueError, chi2_contingency, obs) - - -if __name__ == "__main__": - run_module_suite() diff --git a/wafo/stats/tests/test_continuous_basic.py b/wafo/stats/tests/test_continuous_basic.py index 9a4d662..5d2ebd8 100644 --- a/wafo/stats/tests/test_continuous_basic.py +++ b/wafo/stats/tests/test_continuous_basic.py @@ -7,11 +7,16 @@ import numpy.testing as npt from scipy import integrate from wafo import stats -from wafo.stats.tests.common_tests import (check_normalization, check_moment, - check_mean_expect, - check_var_expect, check_skew_expect, check_kurt_expect, - check_entropy, check_private_entropy, NUMPY_BELOW_1_7, - check_edge_support, check_named_args) + +from wafo.stats.tests.common_tests import (check_normalization, check_moment, check_mean_expect, + check_var_expect, check_skew_expect, + check_kurt_expect, check_entropy, + check_private_entropy, + check_edge_support, check_named_args, + check_random_state_property, + check_meth_dtype, check_ppf_dtype, check_cmplx_deriv, + check_pickling) + from wafo.stats._distr_params import distcont @@ -26,9 +31,12 @@ These tests currently check only/mostly for serious errors and exceptions, not for numerically exact results. """ +# Note that you need to add new distributions you want tested +# to _distr_params + DECIMAL = 5 # specify the precision of the tests # increased from 0 to 5 -## Last four of these fail all around. Need to be checked +# Last four of these fail all around. Need to be checked distcont_extra = [ ['betaprime', (100, 86)], ['fatiguelife', (5,)], @@ -41,58 +49,37 @@ distcont_extra = [ ] -# for testing only specific functions -# distcont = [ -## ['fatiguelife', (29,)], #correction numargs = 1 -## ['loggamma', (0.41411931826052117,)]] - -# for testing ticket:767 -# distcont = [ -## ['genextreme', (3.3184017469423535,)], -## ['genextreme', (0.01,)], -## ['genextreme', (0.00001,)], -## ['genextreme', (0.0,)], -## ['genextreme', (-0.01,)] -## ] - -# distcont = [['gumbel_l', ()], -## ['gumbel_r', ()], -## ['norm', ()] -## ] - -# distcont = [['norm', ()]] - -distmissing = ['wald', 'gausshyper', 'genexpon', 'rv_continuous', - 'loglaplace', 'rdist', 'semicircular', 'invweibull', 'ksone', - 'cosine', 'kstwobign', 'truncnorm', 'mielke', 'recipinvgauss', 'levy', - 'johnsonsu', 'levy_l', 'powernorm', 'wrapcauchy', - 'johnsonsb', 'truncexpon', 'rice', 'invgauss', 'invgamma', - 'powerlognorm'] - -distmiss = [[dist,args] for dist,args in distcont if dist in distmissing] distslow = ['rdist', 'gausshyper', 'recipinvgauss', 'ksone', 'genexpon', 'vonmises', 'vonmises_line', 'mielke', 'semicircular', 'cosine', 'invweibull', 'powerlognorm', 'johnsonsu', 'kstwobign'] # distslow are sorted by speed (very slow to slow) -# NB: not needed anymore? -def _silence_fp_errors(func): - # warning: don't apply to test_ functions as is, then those will be skipped - def wrap(*a, **kw): - olderr = np.seterr(all='ignore') - try: - return func(*a, **kw) - finally: - np.seterr(**olderr) - wrap.__name__ = func.__name__ - return wrap +# These distributions fail the complex derivative test below. +# Here 'fail' mean produce wrong results and/or raise exceptions, depending +# on the implementation details of corresponding special functions. +# cf https://github.com/scipy/scipy/pull/4979 for a discussion. 
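For readers unfamiliar with the complex derivative test behind the fails_cmplx skip list that follows, a minimal sketch of the complex-step idea (illustrative only; it assumes the distribution's private _pdf is built from numpy functions that accept complex input, which holds for norm): for analytic f, f(x + ih) = f(x) + ih*f'(x) + O(h**2), so the imaginary part recovers f'(x) without subtractive cancellation.

import numpy as np
from wafo import stats

def complex_step_deriv(f, x, h=1e-20):
    # derivative from a single complex evaluation; no cancellation error
    return np.imag(f(x + 1j*h)) / h

x = 0.5
# for the standard normal, d/dx pdf(x) = -x * pdf(x)
np.testing.assert_allclose(complex_step_deriv(stats.norm._pdf, x),
                           -x * stats.norm._pdf(x), rtol=1e-12)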
+fails_cmplx = set(['alpha', 'beta', 'betaprime', 'burr12', 'chi', 'chi2', 'dgamma', + 'dweibull', 'erlang', 'expon', 'exponnorm', 'exponpow', + 'exponweib', 'f', 'fatiguelife', 'foldnorm', 'frechet_l', + 'frechet_r', 'gamma', 'gausshyper', 'genexpon', + 'genextreme', 'gengamma', 'genlogistic', 'gennorm', + 'genpareto', 'gilbrat', 'gompertz', 'halfcauchy', + 'halfgennorm', 'halflogistic', 'halfnorm', 'invgamma', + 'invgauss', 'johnsonsb', 'johnsonsu', 'ksone', 'kstwobign', + 'levy_l', 'loggamma', 'logistic', 'lognorm', 'lomax', + 'maxwell', 'nakagami', 'ncf', 'nct', 'ncx2', 'norm', + 'pearson3', 'powerlognorm', 'powernorm', 'rayleigh', + 'recipinvgauss', 'rice', 'skewnorm', 't', 'truncexpon', 'truncnorm', + 'tukeylambda', 'vonmises', 'vonmises_line', 'wald', + 'weibull_min']) def test_cont_basic(): # this test skips slow distributions with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=integrate.IntegrationWarning) + warnings.filterwarnings('ignore', + category=integrate.IntegrationWarning) for distname, arg in distcont[:]: if distname in distslow: continue @@ -106,17 +93,17 @@ def test_cont_basic(): sv = rvs.var() m, v = distfn.stats(*arg) - yield check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn, \ - distname + 'sample mean test' + yield (check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn, + distname + 'sample mean test') yield check_cdf_ppf, distfn, arg, distname yield check_sf_isf, distfn, arg, distname yield check_pdf, distfn, arg, distname yield check_pdf_logpdf, distfn, arg, distname yield check_cdf_logcdf, distfn, arg, distname yield check_sf_logsf, distfn, arg, distname - if distname in distmissing: - alpha = 0.01 - yield check_distribution_rvs, distname, arg, alpha, rvs + + alpha = 0.01 + yield check_distribution_rvs, distname, arg, alpha, rvs locscale_defaults = (0, 1) meths = [distfn.pdf, distfn.logpdf, distfn.cdf, distfn.logcdf, @@ -126,28 +113,35 @@ def test_cont_basic(): 'pareto': 1.5, 'tukeylambda': 0.3} x = spec_x.get(distname, 0.5) yield check_named_args, distfn, x, arg, locscale_defaults, meths + yield check_random_state_property, distfn, arg + # yield check_pickling, distfn, arg # Entropy skp = npt.dec.skipif yield check_entropy, distfn, arg, distname if distfn.numargs == 0: - yield skp(NUMPY_BELOW_1_7)(check_vecentropy), distfn, arg + yield check_vecentropy, distfn, arg if distfn.__class__._entropy != stats.rv_continuous._entropy: yield check_private_entropy, distfn, arg, stats.rv_continuous yield check_edge_support, distfn, arg + yield check_meth_dtype, distfn, arg, meths + yield check_ppf_dtype, distfn, arg + yield skp(distname in fails_cmplx)(check_cmplx_deriv), distfn, arg + knf = npt.dec.knownfailureif - yield knf(distname == 'truncnorm')(check_ppf_private), distfn, \ - arg, distname + yield (knf(distname == 'truncnorm')(check_ppf_private), distfn, + arg, distname) @npt.dec.slow def test_cont_basic_slow(): # same as above for slow distributions with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=integrate.IntegrationWarning) + warnings.filterwarnings('ignore', + category=integrate.IntegrationWarning) for distname, arg in distcont[:]: if distname not in distslow: continue @@ -156,12 +150,12 @@ def test_cont_basic_slow(): distfn = getattr(stats, distname) np.random.seed(765456) sn = 500 - rvs = distfn.rvs(size=sn,*arg) + rvs = distfn.rvs(size=sn, *arg) sm = rvs.mean() sv = rvs.var() m, v = distfn.stats(*arg) - yield check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn, \ - distname + 'sample mean test' + yield 
(check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn, + distname + 'sample mean test') yield check_cdf_ppf, distfn, arg, distname yield check_sf_isf, distfn, arg, distname yield check_pdf, distfn, arg, distname @@ -169,9 +163,9 @@ def test_cont_basic_slow(): yield check_cdf_logcdf, distfn, arg, distname yield check_sf_logsf, distfn, arg, distname # yield check_oth, distfn, arg # is still missing - if distname in distmissing: - alpha = 0.01 - yield check_distribution_rvs, distname, arg, alpha, rvs + + alpha = 0.01 + yield check_distribution_rvs, distname, arg, alpha, rvs locscale_defaults = (0, 1) meths = [distfn.pdf, distfn.logpdf, distfn.cdf, distfn.logcdf, @@ -183,6 +177,8 @@ def test_cont_basic_slow(): elif distname == 'ksone': arg = (3,) yield check_named_args, distfn, x, arg, locscale_defaults, meths + yield check_random_state_property, distfn, arg + # yield check_pickling, distfn, arg # Entropy skp = npt.dec.skipif @@ -190,17 +186,22 @@ def test_cont_basic_slow(): yield skp(ks_cond)(check_entropy), distfn, arg, distname if distfn.numargs == 0: - yield skp(NUMPY_BELOW_1_7)(check_vecentropy), distfn, arg + yield check_vecentropy, distfn, arg if distfn.__class__._entropy != stats.rv_continuous._entropy: yield check_private_entropy, distfn, arg, stats.rv_continuous yield check_edge_support, distfn, arg + yield check_meth_dtype, distfn, arg, meths + yield check_ppf_dtype, distfn, arg + yield skp(distname in fails_cmplx)(check_cmplx_deriv), distfn, arg + @npt.dec.slow def test_moments(): with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=integrate.IntegrationWarning) + warnings.filterwarnings('ignore', + category=integrate.IntegrationWarning) knf = npt.dec.knownfailureif fail_normalization = set(['vonmises', 'ksone']) fail_higher = set(['vonmises', 'ksone', 'ncf']) @@ -209,28 +210,30 @@ def test_moments(): continue distfn = getattr(stats, distname) m, v, s, k = distfn.stats(*arg, moments='mvsk') - cond1, cond2 = distname in fail_normalization, distname in fail_higher + cond1 = distname in fail_normalization + cond2 = distname in fail_higher msg = distname + ' fails moments' yield knf(cond1, msg)(check_normalization), distfn, arg, distname yield knf(cond2, msg)(check_mean_expect), distfn, arg, m, distname - yield knf(cond2, msg)(check_var_expect), distfn, arg, m, v, distname - yield knf(cond2, msg)(check_skew_expect), distfn, arg, m, v, s, \ - distname - yield knf(cond2, msg)(check_kurt_expect), distfn, arg, m, v, k, \ - distname + yield (knf(cond2, msg)(check_var_expect), distfn, arg, m, v, + distname) + yield (knf(cond2, msg)(check_skew_expect), distfn, arg, m, v, s, + distname) + yield (knf(cond2, msg)(check_kurt_expect), distfn, arg, m, v, k, + distname) yield check_loc_scale, distfn, arg, m, v, distname yield check_moment, distfn, arg, m, v, distname def check_sample_meanvar_(distfn, arg, m, v, sm, sv, sn, msg): # this did not work, skipped silently by nose - if not np.isinf(m): + if np.isfinite(m): check_sample_mean(sm, sv, sn, m) - if not np.isinf(v): + if np.isfinite(v): check_sample_var(sv, sn, v) -def check_sample_mean(sm,v,n, popmean): +def check_sample_mean(sm, v, n, popmean): # from stats.stats.ttest_1samp(a, popmean): # Calculates the t-obtained for the independent samples T-test on ONE group # of scores a, given a population mean. 
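For reference, the t statistic that comment describes, as a self-contained sketch (a simplified stand-in for check_sample_mean, not the wafo code itself):

import numpy as np
from scipy import stats

def one_sample_t(sm, v, n, popmean):
    # sm, v: mean and variance of n sample values; returns two-tailed p-value
    df = n - 1
    t = (sm - popmean) / np.sqrt(v / n)
    prob = 2 * stats.t.sf(np.abs(t), df)
    return t, prob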
@@ -243,31 +246,32 @@ def check_sample_mean(sm,v,n, popmean): # return t,prob npt.assert_(prob > 0.01, 'mean fail, t,prob = %f, %f, m, sm=%f,%f' % - (t, prob, popmean, sm)) + (t, prob, popmean, sm)) -def check_sample_var(sv,n, popvar): - # two-sided chisquare test for sample variance equal to hypothesized variance +def check_sample_var(sv, n, popvar): + # two-sided chisquare test for sample variance equal to + # hypothesized variance df = n-1 chi2 = (n-1)*sv/float(popvar) - pval = stats.chisqprob(chi2,df)*2 + pval = stats.distributions.chi2.sf(chi2, df) * 2 npt.assert_(pval > 0.01, 'var fail, t, pval = %f, %f, v, sv=%f, %f' % - (chi2,pval,popvar,sv)) + (chi2, pval, popvar, sv)) -def check_cdf_ppf(distfn,arg,msg): +def check_cdf_ppf(distfn, arg, msg): values = [0.001, 0.5, 0.999] npt.assert_almost_equal(distfn.cdf(distfn.ppf(values, *arg), *arg), values, decimal=DECIMAL, err_msg=msg + ' - cdf-ppf roundtrip') -def check_sf_isf(distfn,arg,msg): - npt.assert_almost_equal(distfn.sf(distfn.isf([0.1,0.5,0.9], *arg), *arg), - [0.1,0.5,0.9], decimal=DECIMAL, err_msg=msg + +def check_sf_isf(distfn, arg, msg): + npt.assert_almost_equal(distfn.sf(distfn.isf([0.1, 0.5, 0.9], *arg), *arg), + [0.1, 0.5, 0.9], decimal=DECIMAL, err_msg=msg + ' - sf-isf roundtrip') - npt.assert_almost_equal(distfn.cdf([0.1,0.9], *arg), - 1.0-distfn.sf([0.1,0.9], *arg), + npt.assert_almost_equal(distfn.cdf([0.1, 0.9], *arg), + 1.0 - distfn.sf([0.1, 0.9], *arg), decimal=DECIMAL, err_msg=msg + ' - cdf-sf relationship') @@ -278,15 +282,16 @@ def check_pdf(distfn, arg, msg): eps = 1e-6 pdfv = distfn.pdf(median, *arg) if (pdfv < 1e-4) or (pdfv > 1e4): - # avoid checking a case where pdf is close to zero or huge (singularity) + # avoid checking a case where pdf is close to zero or + # huge (singularity) median = median + 0.1 pdfv = distfn.pdf(median, *arg) cdfdiff = (distfn.cdf(median + eps, *arg) - distfn.cdf(median - eps, *arg))/eps/2.0 # replace with better diff and better test (more points), # actually, this works pretty well - npt.assert_almost_equal(pdfv, cdfdiff, - decimal=DECIMAL, err_msg=msg + ' - cdf-pdf relationship') + msg += ' - cdf-pdf relationship' + npt.assert_almost_equal(pdfv, cdfdiff, decimal=DECIMAL, err_msg=msg) def check_pdf_logpdf(distfn, args, msg): @@ -297,7 +302,8 @@ def check_pdf_logpdf(distfn, args, msg): logpdf = distfn.logpdf(vals, *args) pdf = pdf[pdf != 0] logpdf = logpdf[np.isfinite(logpdf)] - npt.assert_almost_equal(np.log(pdf), logpdf, decimal=7, err_msg=msg + " - logpdf-log(pdf) relationship") + msg += " - logpdf-log(pdf) relationship" + npt.assert_almost_equal(np.log(pdf), logpdf, decimal=7, err_msg=msg) def check_sf_logsf(distfn, args, msg): @@ -308,7 +314,8 @@ def check_sf_logsf(distfn, args, msg): logsf = distfn.logsf(vals, *args) sf = sf[sf != 0] logsf = logsf[np.isfinite(logsf)] - npt.assert_almost_equal(np.log(sf), logsf, decimal=7, err_msg=msg + " - logsf-log(sf) relationship") + msg += " - logsf-log(sf) relationship" + npt.assert_almost_equal(np.log(sf), logsf, decimal=7, err_msg=msg) def check_cdf_logcdf(distfn, args, msg): @@ -319,24 +326,24 @@ def check_cdf_logcdf(distfn, args, msg): logcdf = distfn.logcdf(vals, *args) cdf = cdf[cdf != 0] logcdf = logcdf[np.isfinite(logcdf)] - npt.assert_almost_equal(np.log(cdf), logcdf, decimal=7, err_msg=msg + " - logcdf-log(cdf) relationship") + msg += " - logcdf-log(cdf) relationship" + npt.assert_almost_equal(np.log(cdf), logcdf, decimal=7, err_msg=msg) def check_distribution_rvs(dist, args, alpha, rvs): # test from scipy.stats.tests # this
version reuses existing random variables - D,pval = stats.kstest(rvs, dist, args=args, N=1000) + D, pval = stats.kstest(rvs, dist, args=args, N=1000) if (pval < alpha): - D,pval = stats.kstest(dist,'',args=args, N=1000) + D, pval = stats.kstest(dist, '', args=args, N=1000) npt.assert_(pval > alpha, "D = " + str(D) + "; pval = " + str(pval) + - "; alpha = " + str(alpha) + "\nargs = " + str(args)) + "; alpha = " + str(alpha) + "\nargs = " + str(args)) def check_vecentropy(distfn, args): npt.assert_equal(distfn.vecentropy(*args), distfn._entropy(*args)) -@npt.dec.skipif(NUMPY_BELOW_1_7) def check_loc_scale(distfn, arg, m, v, msg): loc, scale = 10.0, 10.0 mt, vt = distfn.stats(loc=loc, scale=scale, *arg) @@ -345,7 +352,7 @@ def check_loc_scale(distfn, arg, m, v, msg): def check_ppf_private(distfn, arg, msg): - #fails by design for truncnorm self.nb not defined + # fails by design for truncnorm self.nb not defined ppfs = distfn._ppf(np.array([0.1, 0.5, 0.9]), *arg) npt.assert_(not np.any(np.isnan(ppfs)), msg + 'ppf private is nan') diff --git a/wafo/stats/tests/test_discrete_basic.py b/wafo/stats/tests/test_discrete_basic.py index 942a76f..9f4169f 100644 --- a/wafo/stats/tests/test_discrete_basic.py +++ b/wafo/stats/tests/test_discrete_basic.py @@ -5,18 +5,26 @@ import numpy as np from scipy._lib.six import xrange from wafo import stats -from wafo.stats.tests.common_tests import (check_normalization, check_moment, - check_mean_expect, - check_var_expect, check_skew_expect, check_kurt_expect, - check_entropy, check_private_entropy, check_edge_support, - check_named_args) +from wafo.stats.tests.common_tests import (check_normalization, check_moment, check_mean_expect, + check_var_expect, check_skew_expect, + check_kurt_expect, check_entropy, + check_private_entropy, check_edge_support, + check_named_args, check_random_state_property, + check_pickling) from wafo.stats._distr_params import distdiscrete knf = npt.dec.knownfailureif +vals = ([1, 2, 3, 4], [0.1, 0.2, 0.3, 0.4]) +distdiscrete += [[stats.rv_discrete(values=vals), ()]] + def test_discrete_basic(): for distname, arg in distdiscrete: - distfn = getattr(stats, distname) + try: + distfn = getattr(stats, distname) + except TypeError: + distfn = distname + distname = 'sample distribution' np.random.seed(9765456) rvs = distfn.rvs(size=2000, *arg) supp = np.unique(rvs) @@ -28,15 +36,19 @@ def test_discrete_basic(): yield check_edge_support, distfn, arg alpha = 0.01 - yield check_discrete_chisquare, distfn, arg, rvs, alpha, \ - distname + ' chisquare' + yield (check_discrete_chisquare, distfn, arg, rvs, alpha, + distname + ' chisquare') seen = set() for distname, arg in distdiscrete: if distname in seen: continue seen.add(distname) - distfn = getattr(stats,distname) + try: + distfn = getattr(stats, distname) + except TypeError: + distfn = distname + distname = 'sample distribution' locscale_defaults = (0,) meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf, distfn.logsf] @@ -44,7 +56,10 @@ def test_discrete_basic(): spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0, } k = spec_k.get(distname, 1) yield check_named_args, distfn, k, arg, locscale_defaults, meths - yield check_scale_docstring, distfn + if distname != 'sample distribution': + yield check_scale_docstring, distfn + yield check_random_state_property, distfn, arg + yield check_pickling, distfn, arg # Entropy yield check_entropy, distfn, arg, distname @@ -54,7 +69,11 @@ def test_discrete_basic(): def test_moments(): for distname, arg in distdiscrete: - distfn = 
getattr(stats,distname) + try: + distfn = getattr(stats, distname) + except TypeError: + distfn = distname + distname = 'sample distribution' m, v, s, k = distfn.stats(*arg, moments='mvsk') yield check_normalization, distfn, arg, distname @@ -64,7 +83,7 @@ def test_moments(): yield check_var_expect, distfn, arg, m, v, distname yield check_skew_expect, distfn, arg, m, v, s, distname - cond = False #distname in ['zipf'] + cond = distname in ['zipf'] msg = distname + ' fails kurtosis' yield knf(cond, msg)(check_kurt_expect), distfn, arg, m, v, k, distname @@ -81,35 +100,36 @@ def check_cdf_ppf(distfn, arg, supp, msg): supp, msg + '-roundtrip') supp1 = supp[supp < distfn.b] npt.assert_array_equal(distfn.ppf(distfn.cdf(supp1, *arg) + 1e-8, *arg), - supp1 + distfn.inc, msg + 'ppf-cdf-next') + supp1 + distfn.inc, msg + 'ppf-cdf-next') # -1e-8 could cause an error if pmf < 1e-8 def check_pmf_cdf(distfn, arg, distname): - startind = np.int(distfn.ppf(0.01, *arg) - 1) + startind = int(distfn.ppf(0.01, *arg) - 1) index = list(range(startind, startind + 10)) - cdfs, pmfs_cum = distfn.cdf(index,*arg), distfn.pmf(index, *arg).cumsum() + cdfs = distfn.cdf(index, *arg) + pmfs_cum = distfn.pmf(index, *arg).cumsum() atol, rtol = 1e-10, 1e-10 if distname == 'skellam': # ncx2 accuracy atol, rtol = 1e-5, 1e-5 npt.assert_allclose(cdfs - cdfs[0], pmfs_cum - pmfs_cum[0], - atol=atol, rtol=rtol) + atol=atol, rtol=rtol) def check_moment_frozen(distfn, arg, m, k): npt.assert_allclose(distfn(*arg).moment(k), m, - atol=1e-10, rtol=1e-10) + atol=1e-10, rtol=1e-10) def check_oth(distfn, arg, supp, msg): # checking other methods of distfn npt.assert_allclose(distfn.sf(supp, *arg), 1. - distfn.cdf(supp, *arg), - atol=1e-10, rtol=1e-10) + atol=1e-10, rtol=1e-10) q = np.linspace(0.01, 0.99, 20) npt.assert_allclose(distfn.isf(q, *arg), distfn.ppf(1. - q, *arg), - atol=1e-10, rtol=1e-10) + atol=1e-10, rtol=1e-10) median_sf = distfn.isf(0.5, *arg) npt.assert_(distfn.sf(median_sf - 1, *arg) > 0.5) @@ -133,44 +153,41 @@ def check_discrete_chisquare(distfn, arg, rvs, alpha, msg): result : bool 0 if test passes, 1 if test fails - uses global variable debug for printing results - """ - n = len(rvs) - nsupp = 20 - wsupp = 1.0/nsupp + wsupp = 0.05 - # construct intervals with minimum mass 1/nsupp + # construct intervals with minimum mass `wsupp`. 
# intervals are left-half-open as in a cdf difference - distsupport = xrange(max(distfn.a, -1000), min(distfn.b, 1000) + 1) + lo = max(distfn.a, -1000) + distsupport = xrange(lo, min(distfn.b, 1000) + 1) last = 0 - distsupp = [max(distfn.a, -1000)] + distsupp = [lo] distmass = [] for ii in distsupport: - current = distfn.cdf(ii,*arg) - if current - last >= wsupp-1e-14: + current = distfn.cdf(ii, *arg) + if current - last >= wsupp - 1e-14: distsupp.append(ii) distmass.append(current - last) last = current - if current > (1-wsupp): + if current > (1 - wsupp): break if distsupp[-1] < distfn.b: distsupp.append(distfn.b) - distmass.append(1-last) + distmass.append(1 - last) distsupp = np.array(distsupp) distmass = np.array(distmass) # convert intervals to right-half-open as required by histogram - histsupp = distsupp+1e-8 + histsupp = distsupp + 1e-8 histsupp[0] = distfn.a # find sample frequencies and perform chisquare test - freq,hsupp = np.histogram(rvs,histsupp) - cdfs = distfn.cdf(distsupp,*arg) - (chis,pval) = stats.chisquare(np.array(freq),n*distmass) + freq, hsupp = np.histogram(rvs, histsupp) + chis, pval = stats.chisquare(np.array(freq), len(rvs)*distmass) - npt.assert_(pval > alpha, 'chisquare - test for %s' - ' at arg = %s with pval = %s' % (msg,str(arg),str(pval))) + npt.assert_(pval > alpha, + 'chisquare - test for %s at arg = %s with pval = %s' % + (msg, str(arg), str(pval))) def check_scale_docstring(distfn): diff --git a/wafo/stats/tests/test_distributions.py b/wafo/stats/tests/test_distributions.py index 2fcb252..9025317 100644 --- a/wafo/stats/tests/test_distributions.py +++ b/wafo/stats/tests/test_distributions.py @@ -6,16 +6,16 @@ from __future__ import division, print_function, absolute_import import warnings import re import sys +import pickle from numpy.testing import (TestCase, run_module_suite, assert_equal, assert_array_equal, assert_almost_equal, assert_array_almost_equal, - assert_allclose, assert_, assert_raises, rand, dec) + assert_allclose, assert_, assert_raises, assert_warns, dec) from nose import SkipTest import numpy import numpy as np from numpy import typecodes, array -from scipy._lib._version import NumpyVersion from scipy import special import wafo.stats as stats from wafo.stats._distn_infrastructure import argsreduce @@ -30,18 +30,19 @@ DOCSTRINGS_STRIPPED = sys.flags.optimize > 1 # Generate test cases to test cdf and distribution consistency. # Note that this list does not include all distributions. 
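Each name in the distribution list below is run through check_distribution, which boils down to a one-sample Kolmogorov-Smirnov test of the distribution's rvs against its own cdf; a minimal sketch of that call (passing '' as the cdf argument makes kstest reuse the rvs name):

import numpy as np
from wafo import stats

np.random.seed(1234)
D, pval = stats.kstest('norm', '', args=(), N=1000)
assert pval > 0.01, (D, pval)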
-dists = ['uniform','norm','lognorm','expon','beta', - 'powerlaw','bradford','burr','fisk','cauchy','halfcauchy', - 'foldcauchy','gamma','gengamma','loggamma', - 'alpha','anglit','arcsine','betaprime', - 'dgamma','exponweib','exponpow','frechet_l','frechet_r', - 'gilbrat','f','ncf','chi2','chi','nakagami','genpareto', - 'genextreme','genhalflogistic','pareto','lomax','halfnorm', - 'halflogistic','fatiguelife','foldnorm','ncx2','t','nct', - 'weibull_min','weibull_max','dweibull','maxwell','rayleigh', - 'genlogistic', 'logistic','gumbel_l','gumbel_r','gompertz', - 'hypsecant', 'laplace', 'reciprocal','triang','tukeylambda', - 'vonmises', 'vonmises_line', 'pearson3'] +dists = ['uniform', 'norm', 'lognorm', 'expon', 'beta', + 'powerlaw', 'bradford', 'burr', 'fisk', 'cauchy', 'halfcauchy', + 'foldcauchy', 'gamma', 'gengamma', 'loggamma', + 'alpha', 'anglit', 'arcsine', 'betaprime', 'dgamma', + 'exponnorm', 'exponweib', 'exponpow', 'frechet_l', 'frechet_r', + 'gilbrat', 'f', 'ncf', 'chi2', 'chi', 'nakagami', 'genpareto', + 'genextreme', 'genhalflogistic', 'pareto', 'lomax', 'halfnorm', + 'halflogistic', 'fatiguelife', 'foldnorm', 'ncx2', 't', 'nct', + 'weibull_min', 'weibull_max', 'dweibull', 'maxwell', 'rayleigh', + 'genlogistic', 'logistic', 'gumbel_l', 'gumbel_r', 'gompertz', + 'hypsecant', 'laplace', 'reciprocal', 'triang', 'tukeylambda', + 'vonmises', 'vonmises_line', 'pearson3', 'gennorm', 'halfgennorm', + 'rice'] def _assert_hasattr(a, b, msg=None): @@ -56,20 +57,15 @@ def test_api_regression(): # check function for test generator - - def check_distribution(dist, args, alpha): - D,pval = stats.kstest(dist,'', args=args, N=1000) + D, pval = stats.kstest(dist, '', args=args, N=1000) if (pval < alpha): - D,pval = stats.kstest(dist,'',args=args, N=1000) - # if (pval < alpha): - # D,pval = stats.kstest(dist,'',args=args, N=1000) + D, pval = stats.kstest(dist, '', args=args, N=1000) assert_(pval > alpha, msg="D = " + str(D) + "; pval = " + str(pval) + - "; alpha = " + str(alpha) + "\nargs = " + str(args)) - -# nose test generator + "; alpha = " + str(alpha) + "\nargs = " + str(args)) +# nose test generator def test_all_distributions(): for dist in dists: distfunc = getattr(stats, dist) @@ -78,37 +74,35 @@ def test_all_distributions(): if dist == 'fatiguelife': alpha = 0.001 - if dist == 'frechet': - args = tuple(2*rand(1))+(0,)+tuple(2*rand(2)) - elif dist == 'triang': - args = tuple(rand(nargs)) + if dist == 'triang': + args = tuple(np.random.random(nargs)) elif dist == 'reciprocal': - vals = rand(nargs) + vals = np.random.random(nargs) vals[1] = vals[0] + 1.0 args = tuple(vals) elif dist == 'vonmises': yield check_distribution, dist, (10,), alpha yield check_distribution, dist, (101,), alpha - args = tuple(1.0+rand(nargs)) + args = tuple(1.0 + np.random.random(nargs)) else: - args = tuple(1.0+rand(nargs)) + args = tuple(1.0 + np.random.random(nargs)) yield check_distribution, dist, args, alpha -def check_vonmises_pdf_periodic(k,l,s,x): - vm = stats.vonmises(k,loc=l,scale=s) - assert_almost_equal(vm.pdf(x),vm.pdf(x % (2*numpy.pi*s))) +def check_vonmises_pdf_periodic(k, l, s, x): + vm = stats.vonmises(k, loc=l, scale=s) + assert_almost_equal(vm.pdf(x), vm.pdf(x % (2*numpy.pi*s))) -def check_vonmises_cdf_periodic(k,l,s,x): - vm = stats.vonmises(k,loc=l,scale=s) - assert_almost_equal(vm.cdf(x) % 1,vm.cdf(x % (2*numpy.pi*s)) % 1) +def check_vonmises_cdf_periodic(k, l, s, x): + vm = stats.vonmises(k, loc=l, scale=s) + assert_almost_equal(vm.cdf(x) % 1, vm.cdf(x % (2*numpy.pi*s)) % 1) def 
test_vonmises_pdf_periodic(): for k in [0.1, 1, 101]: - for x in [0,1,numpy.pi,10,100]: + for x in [0, 1, numpy.pi, 10, 100]: yield check_vonmises_pdf_periodic, k, 0, 1, x yield check_vonmises_pdf_periodic, k, 1, 1, x yield check_vonmises_pdf_periodic, k, 0, 10, x @@ -123,31 +117,36 @@ def test_vonmises_line_support(): assert_equal(stats.vonmises_line.b, np.pi) +def test_vonmises_numerical(): + vm = stats.vonmises(800) + assert_almost_equal(vm.cdf(0), 0.5) + + class TestRandInt(TestCase): def test_rvs(self): - vals = stats.randint.rvs(5,30,size=100) + vals = stats.randint.rvs(5, 30, size=100) assert_(numpy.all(vals < 30) & numpy.all(vals >= 5)) assert_(len(vals) == 100) - vals = stats.randint.rvs(5,30,size=(2,50)) - assert_(numpy.shape(vals) == (2,50)) + vals = stats.randint.rvs(5, 30, size=(2, 50)) + assert_(numpy.shape(vals) == (2, 50)) assert_(vals.dtype.char in typecodes['AllInteger']) - val = stats.randint.rvs(15,46) + val = stats.randint.rvs(15, 46) assert_((val >= 15) & (val < 46)) assert_(isinstance(val, numpy.ScalarType), msg=repr(type(val))) - val = stats.randint(15,46).rvs(3) + val = stats.randint(15, 46).rvs(3) assert_(val.dtype.char in typecodes['AllInteger']) def test_pdf(self): k = numpy.r_[0:36] out = numpy.where((k >= 5) & (k < 30), 1.0/(30-5), 0) - vals = stats.randint.pmf(k,5,30) - assert_array_almost_equal(vals,out) + vals = stats.randint.pmf(k, 5, 30) + assert_array_almost_equal(vals, out) def test_cdf(self): x = numpy.r_[0:36:100j] k = numpy.floor(x) - out = numpy.select([k >= 30,k >= 5],[1.0,(k-5.0+1)/(30-5.0)],0) - vals = stats.randint.cdf(x,5,30) + out = numpy.select([k >= 30, k >= 5], [1.0, (k-5.0+1)/(30-5.0)], 0) + vals = stats.randint.cdf(x, 5, 30) assert_array_almost_equal(vals, out, decimal=12) @@ -165,8 +164,8 @@ class TestBinom(TestCase): def test_pmf(self): # regression test for Ticket #1842 - vals1 = stats.binom.pmf(100, 100,1) - vals2 = stats.binom.pmf(0, 100,0) + vals1 = stats.binom.pmf(100, 100, 1) + vals2 = stats.binom.pmf(0, 100, 0) assert_allclose(vals1, 1.0, rtol=1e-15, atol=0) assert_allclose(vals2, 1.0, rtol=1e-15, atol=0) @@ -238,6 +237,9 @@ class TestNBinom(TestCase): # regression test for ticket 1779 assert_allclose(np.exp(stats.nbinom.logpmf(700, 721, 0.52)), stats.nbinom.pmf(700, 721, 0.52)) + # logpmf(0,1,1) shouldn't return nan (regression test for gh-4029) + val = stats.nbinom.logpmf(0, 1, 1) + assert_equal(val, 0) class TestGeom(TestCase): @@ -253,15 +255,19 @@ class TestGeom(TestCase): assert_(val.dtype.char in typecodes['AllInteger']) def test_pmf(self): - vals = stats.geom.pmf([1,2,3],0.5) - assert_array_almost_equal(vals,[0.5,0.25,0.125]) + vals = stats.geom.pmf([1, 2, 3], 0.5) + assert_array_almost_equal(vals, [0.5, 0.25, 0.125]) def test_logpmf(self): # regression test for ticket 1793 - vals1 = np.log(stats.geom.pmf([1,2,3], 0.5)) - vals2 = stats.geom.logpmf([1,2,3], 0.5) + vals1 = np.log(stats.geom.pmf([1, 2, 3], 0.5)) + vals2 = stats.geom.logpmf([1, 2, 3], 0.5) assert_allclose(vals1, vals2, rtol=1e-15, atol=0) + # regression test for gh-4028 + val = stats.geom.logpmf(1, 1) + assert_equal(val, 0.0) + def test_cdf_sf(self): vals = stats.geom.cdf([1, 2, 3], 0.5) vals_sf = stats.geom.sf([1, 2, 3], 0.5) @@ -282,15 +288,54 @@ class TestGeom(TestCase): assert_array_almost_equal(vals, expected) +class TestGennorm(TestCase): + def test_laplace(self): + # test against Laplace (special case for beta=1) + points = [1, 2, 3] + pdf1 = stats.gennorm.pdf(points, 1) + pdf2 = stats.laplace.pdf(points) + assert_almost_equal(pdf1, pdf2) + + def 
test_norm(self): + # test against normal (special case for beta=2) + points = [1, 2, 3] + pdf1 = stats.gennorm.pdf(points, 2) + pdf2 = stats.norm.pdf(points, scale=2**-.5) + assert_almost_equal(pdf1, pdf2) + + +class TestHalfgennorm(TestCase): + def test_expon(self): + # test against exponential (special case for beta=1) + points = [1, 2, 3] + pdf1 = stats.halfgennorm.pdf(points, 1) + pdf2 = stats.expon.pdf(points) + assert_almost_equal(pdf1, pdf2) + + def test_halfnorm(self): + # test against half normal (special case for beta=2) + points = [1, 2, 3] + pdf1 = stats.halfgennorm.pdf(points, 2) + pdf2 = stats.halfnorm.pdf(points, scale=2**-.5) + assert_almost_equal(pdf1, pdf2) + + def test_gennorm(self): + # test against generalized normal + points = [1, 2, 3] + pdf1 = stats.halfgennorm.pdf(points, .497324) + pdf2 = stats.gennorm.pdf(points, .497324) + assert_almost_equal(pdf1, 2*pdf2) + + class TestTruncnorm(TestCase): def test_ppf_ticket1131(self): - vals = stats.truncnorm.ppf([-0.5,0,1e-4,0.5, 1-1e-4,1,2], -1., 1., - loc=[3]*7, scale=2) + vals = stats.truncnorm.ppf([-0.5, 0, 1e-4, 0.5, 1-1e-4, 1, 2], -1., 1., + loc=[3]*7, scale=2) expected = np.array([np.nan, 1, 1.00056419, 3, 4.99943581, 5, np.nan]) assert_array_almost_equal(vals, expected) def test_isf_ticket1131(self): - vals = stats.truncnorm.isf([-0.5,0,1e-4,0.5, 1-1e-4,1,2], -1., 1., + vals = stats.truncnorm.isf([-0.5, 0, 1e-4, 0.5, 1-1e-4, 1, 2], -1., 1., loc=[3]*7, scale=2) expected = np.array([np.nan, 5, 4.99943581, 3, 1.00056419, 1, np.nan]) assert_array_almost_equal(vals, expected) @@ -323,7 +368,7 @@ class TestHypergeom(TestCase): def test_rvs(self): vals = stats.hypergeom.rvs(20, 10, 3, size=(2, 50)) assert_(numpy.all(vals >= 0) & - numpy.all(vals <= 3)) + numpy.all(vals <= 3)) assert_(numpy.shape(vals) == (2, 50)) assert_(vals.dtype.char in typecodes['AllInteger']) val = stats.hypergeom.rvs(20, 3, 10) @@ -342,6 +387,15 @@ class TestHypergeom(TestCase): hgpmf = stats.hypergeom.pmf(2, tot, good, N) assert_almost_equal(hgpmf, 0.0010114963068932233, 11) + def test_args(self): + # test correct output for corner cases of arguments + # see gh-2325 + assert_almost_equal(stats.hypergeom.pmf(0, 2, 1, 0), 1.0, 11) + assert_almost_equal(stats.hypergeom.pmf(1, 2, 1, 0), 0.0, 11) + + assert_almost_equal(stats.hypergeom.pmf(0, 2, 0, 2), 1.0, 11) + assert_almost_equal(stats.hypergeom.pmf(1, 2, 1, 0), 0.0, 11) + def test_cdf_above_one(self): # for some values of parameters, hypergeom cdf was >1, see gh-2238 assert_(0 <= stats.hypergeom.cdf(30, 13397950, 4363, 12390) <= 1.0) @@ -355,7 +409,8 @@ class TestHypergeom(TestCase): quantile = 2e4 res = [] for eaten in fruits_eaten: - res.append(stats.hypergeom.sf(quantile, oranges + pears, oranges, eaten)) + res.append(stats.hypergeom.sf(quantile, oranges + pears, oranges, + eaten)) expected = np.array([0, 1.904153e-114, 2.752693e-66, 4.931217e-32, 8.265601e-11, 0.1237904, 1]) assert_allclose(res, expected, atol=0, rtol=5e-7) @@ -378,6 +433,21 @@ class TestHypergeom(TestCase): h = hg.entropy() assert_equal(h, 0.0) + def test_logsf(self): + # Test logsf for very large numbers. See issue #4982 + # Results compare with those from R (v3.2.0): + # phyper(k, n, M-n, N, lower.tail=FALSE, log.p=TRUE) + # -2239.771 + + k = 1e4 + M = 1e7 + n = 1e6 + N = 5e4 + + result = stats.hypergeom.logsf(k, M, n, N) + expected = -2239.771 # From R + assert_almost_equal(result, expected, decimal=3) + class TestLoggamma(TestCase): @@ -566,7 +636,11 @@ class TestGenpareto(TestCase): 1.
- np.logspace(1e-12, 0.01, base=0.1)] for c in [1e-8, -1e-18, 1e-15, -1e-15]: assert_allclose(stats.genpareto.cdf(stats.genpareto.ppf(q, c), c), - q, atol=1e-15) + q, atol=1e-15) + + def test_logsf(self): + logp = stats.genpareto.logsf(1e10, .01, 0, 1) + assert_allclose(logp, -1842.0680753952365) class TestPearson3(TestCase): @@ -587,7 +661,7 @@ class TestPearson3(TestCase): atol=1e-6) vals = stats.pearson3.pdf(-3, 0.1) assert_allclose(vals, np.array([0.00313791]), atol=1e-6) - vals = stats.pearson3.pdf([-3,-2,-1,0,1], 0.1) + vals = stats.pearson3.pdf([-3, -2, -1, 0, 1], 0.1) assert_allclose(vals, np.array([0.00313791, 0.05192304, 0.25028092, 0.39885918, 0.23413173]), atol=1e-6) @@ -597,12 +671,29 @@ class TestPearson3(TestCase): atol=1e-6) vals = stats.pearson3.cdf(-3, 0.1) assert_allclose(vals, [0.00082256], atol=1e-6) - vals = stats.pearson3.cdf([-3,-2,-1,0,1], 0.1) + vals = stats.pearson3.cdf([-3, -2, -1, 0, 1], 0.1) assert_allclose(vals, [8.22563821e-04, 1.99860448e-02, 1.58550710e-01, 5.06649130e-01, 8.41442111e-01], atol=1e-6) class TestPoisson(TestCase): + + def test_pmf_basic(self): + # Basic case + ln2 = np.log(2) + vals = stats.poisson.pmf([0, 1, 2], ln2) + expected = [0.5, ln2/2, ln2**2/4] + assert_allclose(vals, expected) + + def test_mu0(self): + # Edge case: mu=0 + vals = stats.poisson.pmf([0, 1, 2], 0) + expected = [1, 0, 0] + assert_array_equal(vals, expected) + + interval = stats.poisson.interval(0.95, 0) + assert_equal(interval, (0, 0)) + def test_rvs(self): vals = stats.poisson.rvs(0.5, size=(2, 50)) assert_(numpy.all(vals >= 0)) @@ -619,6 +710,11 @@ class TestPoisson(TestCase): result = stats.poisson.stats(mu, moments='mvsk') assert_allclose(result, [mu, mu, np.sqrt(1.0/mu), 1.0/mu]) + mu = np.array([0.0, 1.0, 2.0]) + result = stats.poisson.stats(mu, moments='mvsk') + expected = (mu, mu, [np.inf, 1, 1/np.sqrt(2)], [np.inf, 1, 0.5]) + assert_allclose(result, expected) + class TestZipf(TestCase): def test_rvs(self): @@ -664,28 +760,27 @@ class TestDLaplace(TestCase): xx = np.arange(-N, N+1) pp = dl.pmf(xx) m2, m4 = np.sum(pp*xx**2), np.sum(pp*xx**4) - assert_equal((m, s), (0,0)) + assert_equal((m, s), (0, 0)) assert_allclose((v, k), (m2, m4/m2**2 - 3.), atol=1e-14, rtol=1e-8) def test_stats2(self): a = np.log(2.) 
dl = stats.dlaplace(a) m, v, s, k = dl.stats('mvsk') - assert_equal((m, s), (0.,0.)) + assert_equal((m, s), (0., 0.)) assert_allclose((v, k), (4., 3.25)) class TestInvGamma(TestCase): - @dec.skipif(NumpyVersion(np.__version__) < '1.7.0', - "assert_* funcs broken with inf/nan") def test_invgamma_inf_gh_1866(self): # invgamma's moments are only finite for a>n # specific numbers checked w/ boost 1.54 with warnings.catch_warnings(): warnings.simplefilter('error', RuntimeWarning) mvsk = stats.invgamma.stats(a=19.31, moments='mvsk') - assert_allclose(mvsk, - [0.05461496450, 0.0001723162534, 1.020362676, 2.055616582]) + expected = [0.05461496450, 0.0001723162534, 1.020362676, + 2.055616582] + assert_allclose(mvsk, expected) a = [1.1, 3.1, 5.6] mvsk = stats.invgamma.stats(a=a, moments='mvsk') @@ -712,11 +807,10 @@ class TestF(TestCase): warnings.simplefilter('error', RuntimeWarning) stats.f.stats(dfn=[11]*4, dfd=[2, 4, 6, 8], moments='mvsk') - @dec.knownfailureif(True, 'f stats does not properly broadcast') def test_stats_broadcast(self): - # stats do not fully broadcast just yet - mv = stats.f.stats(dfn=11, dfd=[11, 12]) - + m, v = stats.f.stats(dfn=11, dfd=[11, 12]) + assert_array_almost_equal(m, [1.22222222, 1.2]) + assert_array_almost_equal(v, [0.77601411, 0.68727273]) def test_rvgeneric_std(): # Regression test for #1191 @@ -725,14 +819,14 @@ def test_rvgeneric_std(): class TestRvDiscrete(TestCase): def test_rvs(self): - states = [-1,0,1,2,3,4] - probability = [0.0,0.3,0.4,0.0,0.3,0.0] + states = [-1, 0, 1, 2, 3, 4] + probability = [0.0, 0.3, 0.4, 0.0, 0.3, 0.0] samples = 1000 - r = stats.rv_discrete(name='sample',values=(states,probability)) + r = stats.rv_discrete(name='sample', values=(states, probability)) x = r.rvs(size=samples) assert_(isinstance(x, numpy.ndarray)) - for s,p in zip(states,probability): + for s, p in zip(states, probability): assert_(abs(sum(x == s)/float(samples) - p) < 0.05) x = r.rvs() @@ -751,22 +845,82 @@ class TestRvDiscrete(TestCase): assert_equal(h, 0.0) +class TestSkewNorm(TestCase): + + def test_normal(self): + # When the skewness is 0 the distribution is normal + x = np.linspace(-5, 5, 100) + assert_array_almost_equal(stats.skewnorm.pdf(x, a=0), + stats.norm.pdf(x)) + + def test_rvs(self): + shape = (3, 4, 5) + x = stats.skewnorm.rvs(a=0.75, size=shape) + assert_equal(shape, x.shape) + + x = stats.skewnorm.rvs(a=-3, size=shape) + assert_equal(shape, x.shape) + + def test_moments(self): + X = stats.skewnorm.rvs(a=4, size=int(1e6), loc=5, scale=2) + assert_array_almost_equal([np.mean(X), np.var(X), stats.skew(X), stats.kurtosis(X)], + stats.skewnorm.stats(a=4, loc=5, scale=2, moments='mvsk'), + decimal=2) + + X = stats.skewnorm.rvs(a=-4, size=int(1e6), loc=5, scale=2) + assert_array_almost_equal([np.mean(X), np.var(X), stats.skew(X), stats.kurtosis(X)], + stats.skewnorm.stats(a=-4, loc=5, scale=2, moments='mvsk'), + decimal=2) + class TestExpon(TestCase): def test_zero(self): - assert_equal(stats.expon.pdf(0),1) + assert_equal(stats.expon.pdf(0), 1) def test_tail(self): # Regression test for ticket 807 assert_equal(stats.expon.cdf(1e-18), 1e-18) assert_equal(stats.expon.isf(stats.expon.sf(40)), 40) +class TestExponNorm(TestCase): + def test_moments(self): + # Some moment test cases based on non-loc/scaled formula + def get_moms(lam, sig, mu): + # See wikipedia for these formulae + # where it is listed as an exponentially modified gaussian + opK2 = 1.0 + 1 / (lam*sig)**2 + exp_skew = 2 / (lam * sig)**3 * opK2**(-1.5) + exp_kurt = 6.0 * (1 + (lam * sig)**2)**(-2) 
+ return [mu + 1/lam, sig*sig + 1.0/(lam*lam), exp_skew, exp_kurt] + + mu, sig, lam = 0, 1, 1 + K = 1.0 / (lam * sig) + sts = stats.exponnorm.stats(K, loc=mu, scale=sig, moments='mvsk') + assert_almost_equal(sts, get_moms(lam, sig, mu)) + mu, sig, lam = -3, 2, 0.1 + K = 1.0 / (lam * sig) + sts = stats.exponnorm.stats(K, loc=mu, scale=sig, moments='mvsk') + assert_almost_equal(sts, get_moms(lam, sig, mu)) + mu, sig, lam = 0, 3, 1 + K = 1.0 / (lam * sig) + sts = stats.exponnorm.stats(K, loc=mu, scale=sig, moments='mvsk') + assert_almost_equal(sts, get_moms(lam, sig, mu)) + mu, sig, lam = -5, 11, 3.5 + K = 1.0 / (lam * sig) + sts = stats.exponnorm.stats(K, loc=mu, scale=sig, moments='mvsk') + assert_almost_equal(sts, get_moms(lam, sig, mu)) + + def test_extremes_x(self): + # Test for extreme values against overflows + assert_almost_equal(stats.exponnorm.pdf(-900, 1), 0.0) + assert_almost_equal(stats.exponnorm.pdf(+900, 1), 0.0) + + class TestGenExpon(TestCase): def test_pdf_unity_area(self): from scipy.integrate import simps # PDF should integrate to one - assert_almost_equal(simps(stats.genexpon.pdf(numpy.arange(0,10,0.01), - 0.5, 0.5, 2.0), - dx=0.01), 1, 1) + p = stats.genexpon.pdf(numpy.arange(0, 10, 0.01), 0.5, 0.5, 2.0) + assert_almost_equal(simps(p, dx=0.01), 1, 1) def test_cdf_bounds(self): # CDF should always be positive @@ -777,7 +931,8 @@ class TestGenExpon(TestCase): class TestExponpow(TestCase): def test_tail(self): assert_almost_equal(stats.exponpow.cdf(1e-10, 2.), 1e-20) - assert_almost_equal(stats.exponpow.isf(stats.exponpow.sf(5, .8), .8), 5) + assert_almost_equal(stats.exponpow.isf(stats.exponpow.sf(5, .8), .8), + 5) class TestSkellam(TestCase): @@ -827,7 +982,9 @@ class TestSkellam(TestCase): class TestLognorm(TestCase): def test_pdf(self): # Regression test for Ticket #1471: avoid nan with 0/0 situation - with np.errstate(divide='ignore'): + # Also make sure there are no warnings at x=0, cf gh-5202 + with warnings.catch_warnings(): + warnings.simplefilter('error', RuntimeWarning) pdf = stats.lognorm.pdf([0, 0.5, 1], 1) assert_array_almost_equal(pdf, [0.0, 0.62749608, 0.39894228]) @@ -835,9 +992,9 @@ class TestLognorm(TestCase): class TestBeta(TestCase): def test_logpdf(self): # Regression test for Ticket #1326: avoid nan with 0*log(0) situation - logpdf = stats.beta.logpdf(0,1,0.5) + logpdf = stats.beta.logpdf(0, 1, 0.5) assert_almost_equal(logpdf, -0.69314718056) - logpdf = stats.beta.logpdf(0,0.5,1) + logpdf = stats.beta.logpdf(0, 0.5, 1) assert_almost_equal(logpdf, np.inf) def test_logpdf_ticket_1866(self): @@ -856,6 +1013,22 @@ class TestBetaPrime(TestCase): assert_(np.isfinite(b.logpdf(x)).all()) assert_allclose(b.pdf(x), np.exp(b.logpdf(x))) + def test_cdf(self): + # regression test for gh-4030: Implementation of + # scipy.stats.betaprime.cdf() + x = stats.betaprime.cdf(0, 0.2, 0.3) + assert_equal(x, 0.0) + + alpha, beta = 267, 1472 + x = np.array([0.2, 0.5, 0.6]) + cdfs = stats.betaprime.cdf(x, alpha, beta) + assert_(np.isfinite(cdfs).all()) + + # check the new cdf implementation vs generic one: + gen_cdf = stats.rv_continuous._cdf_single + cdfs_g = [gen_cdf(stats.betaprime, val, alpha, beta) for val in x] + assert_allclose(cdfs, cdfs_g, atol=0, rtol=2e-12) + class TestGamma(TestCase): def test_pdf(self): @@ -869,21 +1042,24 @@ class TestGamma(TestCase): def test_logpdf(self): # Regression test for Ticket #1326: cornercase avoid nan with 0*log(0) # situation - logpdf = stats.gamma.logpdf(0,1) + logpdf = stats.gamma.logpdf(0, 1) assert_almost_equal(logpdf, 0) class 
TestChi2(TestCase): # regression tests after precision improvements, ticket:1041, not verified def test_precision(self): - assert_almost_equal(stats.chi2.pdf(1000, 1000), 8.919133934753128e-003, 14) - assert_almost_equal(stats.chi2.pdf(100, 100), 0.028162503162596778, 14) + assert_almost_equal(stats.chi2.pdf(1000, 1000), 8.919133934753128e-003, + decimal=14) + assert_almost_equal(stats.chi2.pdf(100, 100), 0.028162503162596778, + decimal=14) class TestArrayArgument(TestCase): # test for ticket:992 def test_noexception(self): - rvs = stats.norm.rvs(loc=(np.arange(5)), scale=np.ones(5), size=(10,5)) - assert_equal(rvs.shape, (10,5)) + rvs = stats.norm.rvs(loc=(np.arange(5)), scale=np.ones(5), + size=(10, 5)) + assert_equal(rvs.shape, (10, 5)) class TestDocstring(TestCase): @@ -903,10 +1079,10 @@ class TestDocstring(TestCase): class TestEntropy(TestCase): def test_entropy_positive(self): # See ticket #497 - pk = [0.5,0.2,0.3] - qk = [0.1,0.25,0.65] - eself = stats.entropy(pk,pk) - edouble = stats.entropy(pk,qk) + pk = [0.5, 0.2, 0.3] + qk = [0.1, 0.25, 0.65] + eself = stats.entropy(pk, pk) + edouble = stats.entropy(pk, qk) assert_(0.0 == eself) assert_(edouble >= 0.0) @@ -930,33 +1106,31 @@ class TestEntropy(TestCase): pk = [[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]] qk = [[0.2, 0.1], [0.3, 0.6], [0.5, 0.3]] assert_array_almost_equal(stats.entropy(pk, qk), - [0.1933259, 0.18609809]) + [0.1933259, 0.18609809]) - @dec.skipif(NumpyVersion(np.__version__) < '1.7.0', - "assert_* funcs broken with inf/nan") def test_entropy_2d_zero(self): pk = [[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]] qk = [[0.0, 0.1], [0.3, 0.6], [0.5, 0.3]] assert_array_almost_equal(stats.entropy(pk, qk), - [np.inf, 0.18609809]) + [np.inf, 0.18609809]) pk[0][0] = 0.0 assert_array_almost_equal(stats.entropy(pk, qk), - [0.17403988, 0.18609809]) + [0.17403988, 0.18609809]) def TestArgsreduce(): - a = array([1,3,2,1,2,3,3]) - b,c = argsreduce(a > 1, a, 2) + a = array([1, 3, 2, 1, 2, 3, 3]) + b, c = argsreduce(a > 1, a, 2) - assert_array_equal(b, [3,2,2,3,3]) - assert_array_equal(c, [2,2,2,2,2]) + assert_array_equal(b, [3, 2, 2, 3, 3]) + assert_array_equal(c, [2, 2, 2, 2, 2]) - b,c = argsreduce(2 > 1, a, 2) + b, c = argsreduce(2 > 1, a, 2) assert_array_equal(b, a[0]) assert_array_equal(c, [2]) - b,c = argsreduce(a > 0, a, 2) + b, c = argsreduce(a > 0, a, 2) assert_array_equal(b, a) assert_array_equal(c, [2] * numpy.size(a)) @@ -971,7 +1145,7 @@ class TestFitMethod(object): raise SkipTest("%s fit known to fail" % dist) distfunc = getattr(stats, dist) with np.errstate(all='ignore'): - res = distfunc.rvs(*args, **{'size':200}) + res = distfunc.rvs(*args, **{'size': 200}) vals = distfunc.fit(res) vals2 = distfunc.fit(res, optimizer='powell') # Only check the length of the return @@ -996,9 +1170,9 @@ class TestFitMethod(object): raise SkipTest("%s fit known to fail" % dist) distfunc = getattr(stats, dist) with np.errstate(all='ignore'): - res = distfunc.rvs(*args, **{'size':200}) - vals = distfunc.fit(res,floc=0) - vals2 = distfunc.fit(res,fscale=1) + res = distfunc.rvs(*args, **{'size': 200}) + vals = distfunc.fit(res, floc=0) + vals2 = distfunc.fit(res, fscale=1) assert_(len(vals) == 2+len(args)) assert_(vals[-2] == 0) assert_(vals2[-1] == 1) @@ -1023,9 +1197,9 @@ class TestFitMethod(object): # Regression test for #1551. 
np.random.seed(12345) with np.errstate(all='ignore'): - x = stats.lognorm.rvs(0.25, 0., 20.0, size=20) + x = stats.lognorm.rvs(0.25, 0., 20.0, size=50000) assert_allclose(np.array(stats.lognorm.fit(x, floc=0, fscale=20)), - [0.25888672, 0, 20], atol=1e-5) + [0.25, 0, 20], atol=1e-2) def test_fix_fit_norm(self): x = np.arange(1, 6) @@ -1099,7 +1273,7 @@ class TestFitMethod(object): a, b, loc, scale = stats.beta.fit(x, floc=0, fscale=1) assert_equal(loc, 0) assert_equal(scale, 1) - assert_allclose(mlefunc(a, b, x), [0,0], atol=1e-6) + assert_allclose(mlefunc(a, b, x), [0, 0], atol=1e-6) # Basic test with f0, floc and fscale given. # This is also a regression test for gh-2514. @@ -1133,11 +1307,57 @@ class TestFitMethod(object): # Check that attempting to fix all the parameters raises a ValueError. assert_raises(ValueError, stats.beta.fit, y, f0=0, f1=1, - floc=2, fscale=3) + floc=2, fscale=3) + + def test_fshapes(self): + # take a beta distribution, with shapes='a, b', and make sure that + # fa is equivalent to f0, and fb is equivalent to f1 + a, b = 3., 4. + x = stats.beta.rvs(a, b, size=100, random_state=1234) + res_1 = stats.beta.fit(x, f0=3.) + res_2 = stats.beta.fit(x, fa=3.) + assert_allclose(res_1, res_2, atol=1e-12, rtol=1e-12) + + res_2 = stats.beta.fit(x, fix_a=3.) + assert_allclose(res_1, res_2, atol=1e-12, rtol=1e-12) + + res_3 = stats.beta.fit(x, f1=4.) + res_4 = stats.beta.fit(x, fb=4.) + assert_allclose(res_3, res_4, atol=1e-12, rtol=1e-12) + + res_4 = stats.beta.fit(x, fix_b=4.) + assert_allclose(res_3, res_4, atol=1e-12, rtol=1e-12) + + # cannot specify both positional and named args at the same time + assert_raises(ValueError, stats.beta.fit, x, fa=1, f0=2) + + # check that attempting to fix all parameters raises a ValueError + assert_raises(ValueError, stats.beta.fit, x, fa=0, f1=1, + floc=2, fscale=3) + + # check that specifying floc, fscale and fshapes works for + # beta and gamma which override the generic fit method + res_5 = stats.beta.fit(x, fa=3., floc=0, fscale=1) + aa, bb, ll, ss = res_5 + assert_equal([aa, ll, ss], [3., 0, 1]) + + # gamma distribution + a = 3. + data = stats.gamma.rvs(a, size=100) + aa, ll, ss = stats.gamma.fit(data, fa=a) + assert_equal(aa, a) + + def test_extra_params(self): + # unknown parameters should raise rather than be silently ignored + dist = stats.exponnorm + data = dist.rvs(K=2, size=100) + dct = dict(enikibeniki=-101) + assert_raises(TypeError, dist.fit, data, **dct) class TestFrozen(TestCase): - # Test that a frozen distribution gives the same results as the original object. + # Test that a frozen distribution gives the same results as the original + # object. # # Only tested for the normal distribution (with loc and scale specified) # and for the gamma distribution (with a shape parameter specified). @@ -1186,9 +1406,12 @@ class TestFrozen(TestCase): assert_equal(result_f, result) result_f = frozen.moment(2) - result = dist.moment(2,loc=10.0, scale=3.0) + result = dist.moment(2, loc=10.0, scale=3.0) assert_equal(result_f, result) + assert_equal(frozen.a, dist.a) + assert_equal(frozen.b, dist.b) + def test_gamma(self): a = 2.0 dist = stats.gamma @@ -1238,6 +1461,9 @@ class TestFrozen(TestCase): result = dist.moment(2, a) assert_equal(result_f, result) + assert_equal(frozen.a, frozen.dist.a) + assert_equal(frozen.b, frozen.dist.b) + def test_regression_ticket_1293(self): # Create a frozen distribution. 
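The equivalence asserted throughout this class is simply that freezing stores the shape, loc and scale arguments and forwards every method to the underlying distribution; a minimal sketch of the pattern being tested:

from wafo import stats

frozen = stats.norm(loc=10.0, scale=3.0)
assert frozen.cdf(12.0) == stats.norm.cdf(12.0, loc=10.0, scale=3.0)
assert frozen.moment(2) == stats.norm.moment(2, loc=10.0, scale=3.0)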
frozen = stats.lognorm(1) @@ -1264,10 +1490,12 @@ class TestFrozen(TestCase): # for c < 0: a, b = 0, -1/c rv = stats.genpareto(c=-0.1) a, b = rv.dist.a, rv.dist.b - assert_equal([a, b], [0., 10.]) + assert_array_equal([a, b], [0., 10.]) + assert_array_equal([rv.a, rv.b], [0., 10.]) stats.genpareto.pdf(0, c=0.1) # this changes genpareto.b - assert_equal([rv.dist.a, rv.dist.b], [a, b]) + assert_array_equal([rv.dist.a, rv.dist.b], [a, b]) + assert_array_equal([rv.a, rv.b], [a, b]) rv1 = stats.genpareto(c=0.1) assert_(rv1.dist is not rv.dist) @@ -1276,6 +1504,61 @@ class TestFrozen(TestCase): # Regression test for gh-3522 assert_(hasattr(stats.distributions, 'rv_frozen')) + def test_random_state(self): + # only check that the random_state attribute exists, + frozen = stats.norm() + assert_(hasattr(frozen, 'random_state')) + + # ... that it can be set, + frozen.random_state = 42 + assert_equal(frozen.random_state.get_state(), + np.random.RandomState(42).get_state()) + + # ... and that .rvs method accepts it as an argument + rndm = np.random.RandomState(1234) + frozen.rvs(size=8, random_state=rndm) + +# def test_pickling(self): +# # test that a frozen instance pickles and unpickles +# # (this method is a clone of common_tests.check_pickling) +# beta = stats.beta(2.3098496451481823, 0.62687954300963677) +# poiss = stats.poisson(3.) +# sample = stats.rv_discrete(values=([0, 1, 2, 3], +# [0.1, 0.2, 0.3, 0.4])) +# +# for distfn in [beta, poiss, sample]: +# distfn.random_state = 1234 +# distfn.rvs(size=8) +# s = pickle.dumps(distfn) +# r0 = distfn.rvs(size=8) +# +# unpickled = pickle.loads(s) +# r1 = unpickled.rvs(size=8) +# assert_equal(r0, r1) +# +# # also smoke test some methods +# medians = [distfn.ppf(0.5), unpickled.ppf(0.5)] +# assert_equal(medians[0], medians[1]) +# assert_equal(distfn.cdf(medians[0]), +# unpickled.cdf(medians[1])) + + def test_expect(self): + # smoke test the expect method of the frozen distribution + # only take a gamma w/loc and scale and poisson with loc specified + def func(x): + return x + + gm = stats.gamma(2, loc=3, scale=4) + gm_val = gm.expect(func, lb=1, ub=2, conditional=True) + gamma_val = stats.gamma.expect(func, args=(2,), loc=3, scale=4, + lb=1, ub=2, conditional=True) + assert_allclose(gm_val, gamma_val) + + p = stats.poisson(3, loc=4) + p_val = p.expect(func) + poisson_val = stats.poisson.expect(func, args=(3,), loc=4) + assert_allclose(p_val, poisson_val) + class TestExpect(TestCase): # Test for expect method. @@ -1300,20 +1583,20 @@ class TestExpect(TestCase): def test_beta(self): # case with finite support interval - v = stats.beta.expect(lambda x: (x-19/3.)*(x-19/3.), args=(10,5), + v = stats.beta.expect(lambda x: (x-19/3.)*(x-19/3.), args=(10, 5), loc=5, scale=2) assert_almost_equal(v, 1./18., decimal=13) - m = stats.beta.expect(lambda x: x, args=(10,5), loc=5., scale=2.) + m = stats.beta.expect(lambda x: x, args=(10, 5), loc=5., scale=2.) 
assert_almost_equal(m, 19/3., decimal=13) ub = stats.beta.ppf(0.95, 10, 10, loc=5, scale=2) lb = stats.beta.ppf(0.05, 10, 10, loc=5, scale=2) - prob90 = stats.beta.expect(lambda x: 1., args=(10,10), loc=5., - scale=2.,lb=lb, ub=ub, conditional=False) + prob90 = stats.beta.expect(lambda x: 1., args=(10, 10), loc=5., + scale=2., lb=lb, ub=ub, conditional=False) assert_almost_equal(prob90, 0.9, decimal=13) - prob90c = stats.beta.expect(lambda x: 1, args=(10,10), loc=5, + prob90c = stats.beta.expect(lambda x: 1, args=(10, 10), loc=5, scale=2, lb=lb, ub=ub, conditional=True) assert_almost_equal(prob90c, 1., decimal=13) @@ -1330,19 +1613,20 @@ class TestExpect(TestCase): assert_almost_equal(v, v_true, decimal=14) # with bounds, bounds equal to shifted support - v_bounds = stats.hypergeom.expect(lambda x: (x-9.)**2, args=(20, 10, 8), + v_bounds = stats.hypergeom.expect(lambda x: (x-9.)**2, + args=(20, 10, 8), loc=5., lb=5, ub=13) assert_almost_equal(v_bounds, v_true, decimal=14) # drop boundary points prob_true = 1-stats.hypergeom.pmf([5, 13], 20, 10, 8, loc=5).sum() prob_bounds = stats.hypergeom.expect(lambda x: 1, args=(20, 10, 8), - loc=5., lb=6, ub=12) + loc=5., lb=6, ub=12) assert_almost_equal(prob_bounds, prob_true, decimal=13) # conditional prob_bc = stats.hypergeom.expect(lambda x: 1, args=(20, 10, 8), loc=5., - lb=6, ub=12, conditional=True) + lb=6, ub=12, conditional=True) assert_almost_equal(prob_bc, 1, decimal=14) # check simple integral @@ -1353,8 +1637,8 @@ class TestExpect(TestCase): def test_poisson(self): # poisson, use lower bound only prob_bounds = stats.poisson.expect(lambda x: 1, args=(2,), lb=3, - conditional=False) - prob_b_true = 1-stats.poisson.cdf(2,2) + conditional=False) + prob_b_true = 1-stats.poisson.cdf(2, 2) assert_almost_equal(prob_bounds, prob_b_true, decimal=14) prob_lb = stats.poisson.expect(lambda x: 1, args=(2,), lb=2, @@ -1380,6 +1664,56 @@ class TestExpect(TestCase): assert_(np.isfinite(stats.rice.expect(lambda x: 2, args=(0.74,)))) assert_(np.isfinite(stats.rice.expect(lambda x: 3, args=(0.74,)))) + def test_logser(self): + # test a discrete distribution with infinite support and loc + p, loc = 0.3, 3 + res_0 = stats.logser.expect(lambda k: k, args=(p,)) + # check against the correct answer (sum of a geom series) + assert_allclose(res_0, + p / (p - 1.) / np.log(1. - p), atol=1e-15) + + # now check it with `loc` + res_l = stats.logser.expect(lambda k: k, args=(p,), loc=loc) + assert_allclose(res_l, res_0 + loc, atol=1e-15) + + def test_skellam(self): + # Use a discrete distribution w/ bi-infinite support. 
Compute two first + # moments and compare to known values (cf skellam.stats) + p1, p2 = 18, 22 + m1 = stats.skellam.expect(lambda x: x, args=(p1, p2)) + m2 = stats.skellam.expect(lambda x: x**2, args=(p1, p2)) + assert_allclose(m1, p1 - p2, atol=1e-12) + assert_allclose(m2 - m1**2, p1 + p2, atol=1e-12) + + def test_randint(self): + # Use a discrete distribution w/ parameter-dependent support, which + # is larger than the default chunksize + lo, hi = 0, 113 + res = stats.randint.expect(lambda x: x, (lo, hi)) + assert_allclose(res, + sum(_ for _ in range(lo, hi)) / (hi - lo), atol=1e-15) + + def test_zipf(self): + # Test that there is no infinite loop even if the sum diverges + assert_warns(RuntimeWarning, stats.zipf.expect, + lambda x: x**2, (2,)) + + def test_discrete_kwds(self): + # check that discrete expect accepts keywords to control the summation + n0 = stats.poisson.expect(lambda x: 1, args=(2,)) + + assert_almost_equal(n0, 1, decimal=14) + + def test_moment(self): + # test the .moment() method: compute a higher moment and compare to + # a known value + def poiss_moment5(mu): + return mu**5 + 10*mu**4 + 25*mu**3 + 15*mu**2 + mu + + for mu in [5, 7]: + m5 = stats.poisson.moment(5, mu) + assert_allclose(m5, poiss_moment5(mu), rtol=1e-10) + class TestNct(TestCase): def test_nc_parameter(self): @@ -1391,7 +1725,8 @@ class TestNct(TestCase): assert_almost_equal(rv.cdf(0), 0.841344746069, decimal=10) def test_broadcasting(self): - res = stats.nct.pdf(5, np.arange(4,7)[:,None], np.linspace(0.1, 1, 4)) + res = stats.nct.pdf(5, np.arange(4, 7)[:, None], + np.linspace(0.1, 1, 4)) expected = array([[0.00321886, 0.00557466, 0.00918418, 0.01442997], [0.00217142, 0.00395366, 0.00683888, 0.01126276], [0.00153078, 0.00291093, 0.00525206, 0.00900815]]) @@ -1436,7 +1771,7 @@ class TestRice(TestCase): # see e.g. Abramovich & Stegun 9.6.7 & 9.6.10 b = 1e-8 assert_allclose(stats.rice.pdf(x, 0), stats.rice.pdf(x, b), - atol=b, rtol=0) + atol=b, rtol=0) def test_rice_rvs(self): rvs = stats.rice.rvs @@ -1451,9 +1786,10 @@ class TestErlang(TestCase): with warnings.catch_warnings(): warnings.simplefilter("error", RuntimeWarning) - # The non-integer shape parameter 1.3 should trigger a RuntimeWarning + # The non-integer shape parameter 1.3 should trigger a + # RuntimeWarning assert_raises(RuntimeWarning, - stats.erlang.rvs, 1.3, loc=0, scale=1, size=4) + stats.erlang.rvs, 1.3, loc=0, scale=1, size=4) # Calling the fit method with `f0` set to an integer should # *not* trigger a RuntimeWarning. 
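The closed form used in test_moment above is the Poisson raw-moment identity E[X**m] = sum_k S(m, k) * mu**k, where S(m, k) are Stirling numbers of the second kind and S(5, 1..5) = 1, 15, 25, 10, 1; a quick numerical cross-check (illustrative only):

import numpy as np
from wafo import stats

mu = 5.0
stirling = [1, 15, 25, 10, 1]    # S(5, k) for k = 1..5
closed_form = sum(s * mu**k for k, s in enumerate(stirling, start=1))
np.testing.assert_allclose(stats.poisson.moment(5, mu), closed_form, rtol=1e-10)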
It should return the same @@ -1515,18 +1851,19 @@ class TestRdist(TestCase): distfn = stats.rdist values = [0.001, 0.5, 0.999] assert_almost_equal(distfn.cdf(distfn.ppf(values, 541.0), 541.0), - values, decimal=5) + values, decimal=5) def test_540_567(): # test for nan returned in tickets 540, 567 - assert_almost_equal(stats.norm.cdf(-1.7624320982),0.03899815971089126, - decimal=10, err_msg='test_540_567') - assert_almost_equal(stats.norm.cdf(-1.7624320983),0.038998159702449846, - decimal=10, err_msg='test_540_567') + assert_almost_equal(stats.norm.cdf(-1.7624320982), 0.03899815971089126, + decimal=10, err_msg='test_540_567') + assert_almost_equal(stats.norm.cdf(-1.7624320983), 0.038998159702449846, + decimal=10, err_msg='test_540_567') assert_almost_equal(stats.norm.cdf(1.38629436112, loc=0.950273420309, - scale=0.204423758009),0.98353464004309321, - decimal=10, err_msg='test_540_567') + scale=0.204423758009), + 0.98353464004309321, + decimal=10, err_msg='test_540_567') def test_regression_ticket_1316(): @@ -1541,7 +1878,8 @@ def test_regression_ticket_1326(): def test_regression_tukey_lambda(): - # Make sure that Tukey-Lambda distribution correctly handles non-positive lambdas. + # Make sure that Tukey-Lambda distribution correctly handles + # non-positive lambdas. x = np.linspace(-5.0, 5.0, 101) olderr = np.seterr(divide='ignore') @@ -1630,6 +1968,16 @@ def test_regression_ticket_1530(): assert_almost_equal(params, expected, decimal=1) +def test_gh_pr_4806(): + # Check starting values for Cauchy distribution fit. + np.random.seed(1234) + x = np.random.randn(42) + for offset in 10000.0, 1222333444.0: + loc, scale = stats.cauchy.fit(x + offset) + assert_allclose(loc, offset, atol=1.0) + assert_allclose(scale, 0.6, atol=1.0) + + def test_tukeylambda_stats_ticket_1545(): # Some test for the variance and kurtosis of the Tukey Lambda distr. # See test_tukeylamdba_stats.py for more tests. @@ -1692,6 +2040,37 @@ def test_powerlaw_stats(): assert_array_almost_equal(mvsk, exact_mvsk) +def test_powerlaw_edge(): + # Regression test for gh-3986. + p = stats.powerlaw.logpdf(0, 1) + assert_equal(p, 0.0) + + +def test_exponpow_edge(): + # Regression test for gh-3982. + p = stats.exponpow.logpdf(0, 1) + assert_equal(p, 0.0) + + # Check pdf and logpdf at x = 0 for other values of b. + p = stats.exponpow.pdf(0, [0.25, 1.0, 1.5]) + assert_equal(p, [np.inf, 1.0, 0.0]) + p = stats.exponpow.logpdf(0, [0.25, 1.0, 1.5]) + assert_equal(p, [np.inf, 0.0, -np.inf]) + + +def test_gengamma_edge(): + # Regression test for gh-3985. + p = stats.gengamma.pdf(0, 1, 1) + assert_equal(p, 1.0) + + # Regression tests for gh-4724. + p = stats.gengamma._munp(-2, 200, 1.) + assert_almost_equal(p, 1./199/198) + + p = stats.gengamma._munp(-2, 10, 1.) + assert_almost_equal(p, 1./9/8) + + def test_ksone_fit_freeze(): # Regression test for ticket #1638. d = np.array( @@ -1822,6 +2201,16 @@ def test_ncx2_tails_ticket_955(): assert_allclose(a, b, rtol=1e-3, atol=0) +def test_ncx2_tails_pdf(): + # ncx2.pdf does not return nans in extreme tails(example from gh-1577) + # NB: this is to check that nan_to_num is not needed in ncx2.pdf + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + assert_equal(stats.ncx2.pdf(1, np.arange(340, 350), 2), 0) + # logval = stats.ncx2.logpdf(1, np.arange(340, 350), 2) + # assert_(np.isneginf(logval).all()) + + def test_foldnorm_zero(): # Parameter value c=0 was not enabled, see gh-2399. 
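test_gh_pr_4806 above pins down the fix for Cauchy fitting on strongly offset data; the same behaviour can be observed directly, as in this sketch with the seed and offsets taken from the test.

    import numpy as np
    from scipy import stats

    np.random.seed(1234)
    x = np.random.randn(42)
    for offset in (10000.0, 1222333444.0):
        # with data-driven starting values the fitted location tracks the offset
        loc, scale = stats.cauchy.fit(x + offset)
        assert abs(loc - offset) < 1.0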
rv = stats.foldnorm(0, scale=1) @@ -1836,7 +2225,8 @@ def test_stats_shapes_argcheck(): mv2_augmented = tuple(np.r_[np.nan, _] for _ in mv2) assert_equal(mv2_augmented, mv3) - mv3 = stats.lognorm.stats([2, 2.4, -1]) # -1 is not a legal shape parameter + # -1 is not a legal shape parameter + mv3 = stats.lognorm.stats([2, 2.4, -1]) mv2 = stats.lognorm.stats([2, 2.4]) mv2_augmented = tuple(np.r_[_, np.nan] for _ in mv2) assert_equal(mv2_augmented, mv3) @@ -1846,7 +2236,7 @@ def test_stats_shapes_argcheck(): # anyway, so some distributions may or may not fail. -## Test subclassing distributions w/ explicit shapes +# Test subclassing distributions w/ explicit shapes class _distr_gen(stats.rv_continuous): def _pdf(self, x, a): @@ -1978,8 +2368,9 @@ class TestSubclassingExplicitShapes(TestCase): # this is a limitation of the framework (_pdf(x, *goodargs)) class _distr_gen(stats.rv_continuous): def _pdf(self, x, *args, **kwargs): - # _pdf should handle *args, **kwargs itself. Here "handling" is - # ignoring *args and looking for ``extra_kwarg`` and using that. + # _pdf should handle *args, **kwargs itself. Here "handling" + # is ignoring *args and looking for ``extra_kwarg`` and using + # that. extra_kwarg = kwargs.pop('extra_kwarg', 1) return stats.norm._pdf(x) * extra_kwarg @@ -2075,5 +2466,52 @@ def test_infinite_input(): assert_almost_equal(stats.ncx2._cdf(np.inf, 8, 0.1), 1) +def test_lomax_accuracy(): + # regression test for gh-4033 + p = stats.lomax.ppf(stats.lomax.cdf(1e-100, 1), 1) + assert_allclose(p, 1e-100) + + +def test_gompertz_accuracy(): + # Regression test for gh-4031 + p = stats.gompertz.ppf(stats.gompertz.cdf(1e-100, 1), 1) + assert_allclose(p, 1e-100) + + +def test_truncexpon_accuracy(): + # regression test for gh-4035 + p = stats.truncexpon.ppf(stats.truncexpon.cdf(1e-100, 1), 1) + assert_allclose(p, 1e-100) + + +def test_rayleigh_accuracy(): + # regression test for gh-4034 + p = stats.rayleigh.isf(stats.rayleigh.sf(9, 1), 1) + assert_almost_equal(p, 9.0, decimal=15) + + +def test_genextreme_entropy(): + # regression test for gh-5181 + euler_gamma = 0.5772156649015329 + + h = stats.genextreme.entropy(-1.0) + assert_allclose(h, 2*euler_gamma + 1, rtol=1e-14) + + h = stats.genextreme.entropy(0) + assert_allclose(h, euler_gamma + 1, rtol=1e-14) + + h = stats.genextreme.entropy(1.0) + assert_equal(h, 1) + + h = stats.genextreme.entropy(-2.0, scale=10) + assert_allclose(h, euler_gamma*3 + np.log(10) + 1, rtol=1e-14) + + h = stats.genextreme.entropy(10) + assert_allclose(h, -9*euler_gamma + 1, rtol=1e-14) + + h = stats.genextreme.entropy(-10) + assert_allclose(h, 11*euler_gamma + 1, rtol=1e-14) + + if __name__ == "__main__": run_module_suite() diff --git a/wafo/stats/tests/test_fit.py b/wafo/stats/tests/test_fit.py index 45fd567..c304563 100644 --- a/wafo/stats/tests/test_fit.py +++ b/wafo/stats/tests/test_fit.py @@ -3,7 +3,7 @@ from __future__ import division, print_function, absolute_import import os import numpy as np -from numpy.testing import dec +from numpy.testing import dec, assert_allclose from wafo import stats @@ -13,12 +13,12 @@ from wafo.stats.tests.test_continuous_basic import distcont # verifies that the estimate and true values don't differ by too much fit_sizes = [1000, 5000] # sample sizes to try + thresh_percent = 0.25 # percent of true parameters for fail cut-off thresh_min = 0.75 # minimum difference estimate - true to fail test failing_fits = [ 'burr', - 'chi', 'chi2', 'gausshyper', 'genexpon', @@ -107,5 +107,18 @@ def check_cont_fit(distname,arg): raise 
AssertionError('fit not very good in %s\n' % distfn.name + txt) +def _check_loc_scale_mle_fit(name, data, desired, atol=None): + d = getattr(stats, name) + actual = d.fit(data)[-2:] + assert_allclose(actual, desired, atol=atol, + err_msg='poor mle fit of (loc, scale) in %s' % name) + + +def test_non_default_loc_scale_mle_fit(): + data = np.array([1.01, 1.78, 1.78, 1.78, 1.88, 1.88, 1.88, 2.00]) + yield _check_loc_scale_mle_fit, 'uniform', data, [1.01, 0.99], 1e-3 + yield _check_loc_scale_mle_fit, 'expon', data, [1.01, 0.73875], 1e-3 + + if __name__ == "__main__": np.testing.run_module_suite() diff --git a/wafo/stats/tests/test_kdeoth.py b/wafo/stats/tests/test_kdeoth.py deleted file mode 100644 index 9b2941d..0000000 --- a/wafo/stats/tests/test_kdeoth.py +++ /dev/null @@ -1,202 +0,0 @@ -from __future__ import division, print_function, absolute_import - -from wafo import stats -import numpy as np -from numpy.testing import assert_almost_equal, assert_, assert_raises, \ - assert_array_almost_equal, assert_array_almost_equal_nulp, run_module_suite - - -def test_kde_1d(): - #some basic tests comparing to normal distribution - np.random.seed(8765678) - n_basesample = 500 - xn = np.random.randn(n_basesample) - xnmean = xn.mean() - xnstd = xn.std(ddof=1) - - # get kde for original sample - gkde = stats.gaussian_kde(xn) - - # evaluate the density function for the kde for some points - xs = np.linspace(-7,7,501) - kdepdf = gkde.evaluate(xs) - normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd) - intervall = xs[1] - xs[0] - - assert_(np.sum((kdepdf - normpdf)**2)*intervall < 0.01) - prob1 = gkde.integrate_box_1d(xnmean, np.inf) - prob2 = gkde.integrate_box_1d(-np.inf, xnmean) - assert_almost_equal(prob1, 0.5, decimal=1) - assert_almost_equal(prob2, 0.5, decimal=1) - assert_almost_equal(gkde.integrate_box(xnmean, np.inf), prob1, decimal=13) - assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), prob2, decimal=13) - - assert_almost_equal(gkde.integrate_kde(gkde), - (kdepdf**2).sum()*intervall, decimal=2) - assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2), - (kdepdf*normpdf).sum()*intervall, decimal=2) - - -def test_kde_bandwidth_method(): - def scotts_factor(kde_obj): - """Same as default, just check that it works.""" - return np.power(kde_obj.n, -1./(kde_obj.d+4)) - - np.random.seed(8765678) - n_basesample = 50 - xn = np.random.randn(n_basesample) - - # Default - gkde = stats.gaussian_kde(xn) - # Supply a callable - gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor) - # Supply a scalar - gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor) - - xs = np.linspace(-7,7,51) - kdepdf = gkde.evaluate(xs) - kdepdf2 = gkde2.evaluate(xs) - assert_almost_equal(kdepdf, kdepdf2) - kdepdf3 = gkde3.evaluate(xs) - assert_almost_equal(kdepdf, kdepdf3) - - assert_raises(ValueError, stats.gaussian_kde, xn, bw_method='wrongstring') - - -# Subclasses that should stay working (extracted from various sources). -# Unfortunately the earlier design of gaussian_kde made it necessary for users -# to create these kinds of subclasses, or call _compute_covariance() directly. 
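The new _check_loc_scale_mle_fit helper above exercises analytic MLE results for loc and scale; the expected values follow from the sample itself, as this sketch with the same data shows (scipy.stats assumed).

    import numpy as np
    from scipy import stats

    data = np.array([1.01, 1.78, 1.78, 1.78, 1.88, 1.88, 1.88, 2.00])
    # uniform MLE: loc is the sample minimum, scale the sample range
    loc, scale = stats.uniform.fit(data)
    assert np.allclose([loc, scale], [1.01, 0.99], atol=1e-3)
    # exponential MLE: loc is the minimum, scale the mean excess over it
    loc, scale = stats.expon.fit(data)
    assert np.allclose([loc, scale], [1.01, 0.73875], atol=1e-3)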
- -class _kde_subclass1(stats.gaussian_kde): - def __init__(self, dataset): - self.dataset = np.atleast_2d(dataset) - self.d, self.n = self.dataset.shape - self.covariance_factor = self.scotts_factor - self._compute_covariance() - - -class _kde_subclass2(stats.gaussian_kde): - def __init__(self, dataset): - self.covariance_factor = self.scotts_factor - super(_kde_subclass2, self).__init__(dataset) - - -class _kde_subclass3(stats.gaussian_kde): - def __init__(self, dataset, covariance): - self.covariance = covariance - stats.gaussian_kde.__init__(self, dataset) - - def _compute_covariance(self): - self.inv_cov = np.linalg.inv(self.covariance) - self._norm_factor = np.sqrt(np.linalg.det(2*np.pi * self.covariance)) \ - * self.n - - -class _kde_subclass4(stats.gaussian_kde): - def covariance_factor(self): - return 0.5 * self.silverman_factor() - - -def test_gaussian_kde_subclassing(): - x1 = np.array([-7, -5, 1, 4, 5], dtype=np.float) - xs = np.linspace(-10, 10, num=50) - - # gaussian_kde itself - kde = stats.gaussian_kde(x1) - ys = kde(xs) - - # subclass 1 - kde1 = _kde_subclass1(x1) - y1 = kde1(xs) - assert_array_almost_equal_nulp(ys, y1, nulp=10) - - # subclass 2 - kde2 = _kde_subclass2(x1) - y2 = kde2(xs) - assert_array_almost_equal_nulp(ys, y2, nulp=10) - - # subclass 3 - kde3 = _kde_subclass3(x1, kde.covariance) - y3 = kde3(xs) - assert_array_almost_equal_nulp(ys, y3, nulp=10) - - # subclass 4 - kde4 = _kde_subclass4(x1) - y4 = kde4(x1) - y_expected = [0.06292987, 0.06346938, 0.05860291, 0.08657652, 0.07904017] - - assert_array_almost_equal(y_expected, y4, decimal=6) - - # Not a subclass, but check for use of _compute_covariance() - kde5 = kde - kde5.covariance_factor = lambda: kde.factor - kde5._compute_covariance() - y5 = kde5(xs) - assert_array_almost_equal_nulp(ys, y5, nulp=10) - - -def test_gaussian_kde_covariance_caching(): - x1 = np.array([-7, -5, 1, 4, 5], dtype=np.float) - xs = np.linspace(-10, 10, num=5) - # These expected values are from scipy 0.10, before some changes to - # gaussian_kde. They were not compared with any external reference. - y_expected = [0.02463386, 0.04689208, 0.05395444, 0.05337754, 0.01664475] - - # Set the bandwidth, then reset it to the default. - kde = stats.gaussian_kde(x1) - kde.set_bandwidth(bw_method=0.5) - kde.set_bandwidth(bw_method='scott') - y2 = kde(xs) - - assert_array_almost_equal(y_expected, y2, decimal=7) - - -def test_gaussian_kde_monkeypatch(): - """Ugly, but people may rely on this. See scipy pull request 123, - specifically the linked ML thread "Width of the Gaussian in stats.kde". - If it is necessary to break this later on, that is to be discussed on ML. - """ - x1 = np.array([-7, -5, 1, 4, 5], dtype=np.float) - xs = np.linspace(-10, 10, num=50) - - # The old monkeypatched version to get at Silverman's Rule. - kde = stats.gaussian_kde(x1) - kde.covariance_factor = kde.silverman_factor - kde._compute_covariance() - y1 = kde(xs) - - # The new saner version. 
- kde2 = stats.gaussian_kde(x1, bw_method='silverman') - y2 = kde2(xs) - - assert_array_almost_equal_nulp(y1, y2, nulp=10) - - -def test_kde_integer_input(): - """Regression test for #1181.""" - x1 = np.arange(5) - kde = stats.gaussian_kde(x1) - y_expected = [0.13480721, 0.18222869, 0.19514935, 0.18222869, 0.13480721] - assert_array_almost_equal(kde(x1), y_expected, decimal=6) - - -def test_pdf_logpdf(): - np.random.seed(1) - n_basesample = 50 - xn = np.random.randn(n_basesample) - - # Default - gkde = stats.gaussian_kde(xn) - - xs = np.linspace(-15, 12, 25) - pdf = gkde.evaluate(xs) - pdf2 = gkde.pdf(xs) - assert_almost_equal(pdf, pdf2, decimal=12) - - logpdf = np.log(pdf) - logpdf2 = gkde.logpdf(xs) - assert_almost_equal(logpdf, logpdf2, decimal=12) - - -if __name__ == "__main__": - run_module_suite() diff --git a/wafo/stats/tests/test_morestats.py b/wafo/stats/tests/test_morestats.py deleted file mode 100644 index 3813d57..0000000 --- a/wafo/stats/tests/test_morestats.py +++ /dev/null @@ -1,1009 +0,0 @@ -# Author: Travis Oliphant, 2002 -# -# Further enhancements and tests added by numerous SciPy developers. -# -from __future__ import division, print_function, absolute_import - -import warnings - -import numpy as np -from numpy.random import RandomState -from numpy.testing import (TestCase, run_module_suite, assert_array_equal, - assert_almost_equal, assert_array_less, assert_array_almost_equal, - assert_raises, assert_, assert_allclose, assert_equal, dec, assert_warns) - -from wafo import stats - -# Matplotlib is not a scipy dependency but is optionally used in probplot, so -# check if it's available -try: - import matplotlib.pyplot as plt - have_matplotlib = True -except: - have_matplotlib = False - - -g1 = [1.006, 0.996, 0.998, 1.000, 0.992, 0.993, 1.002, 0.999, 0.994, 1.000] -g2 = [0.998, 1.006, 1.000, 1.002, 0.997, 0.998, 0.996, 1.000, 1.006, 0.988] -g3 = [0.991, 0.987, 0.997, 0.999, 0.995, 0.994, 1.000, 0.999, 0.996, 0.996] -g4 = [1.005, 1.002, 0.994, 1.000, 0.995, 0.994, 0.998, 0.996, 1.002, 0.996] -g5 = [0.998, 0.998, 0.982, 0.990, 1.002, 0.984, 0.996, 0.993, 0.980, 0.996] -g6 = [1.009, 1.013, 1.009, 0.997, 0.988, 1.002, 0.995, 0.998, 0.981, 0.996] -g7 = [0.990, 1.004, 0.996, 1.001, 0.998, 1.000, 1.018, 1.010, 0.996, 1.002] -g8 = [0.998, 1.000, 1.006, 1.000, 1.002, 0.996, 0.998, 0.996, 1.002, 1.006] -g9 = [1.002, 0.998, 0.996, 0.995, 0.996, 1.004, 1.004, 0.998, 0.999, 0.991] -g10 = [0.991, 0.995, 0.984, 0.994, 0.997, 0.997, 0.991, 0.998, 1.004, 0.997] - - -class TestShapiro(TestCase): - def test_basic(self): - x1 = [0.11,7.87,4.61,10.14,7.95,3.14,0.46, - 4.43,0.21,4.75,0.71,1.52,3.24, - 0.93,0.42,4.97,9.53,4.55,0.47,6.66] - w,pw = stats.shapiro(x1) - assert_almost_equal(w,0.90047299861907959,6) - assert_almost_equal(pw,0.042089745402336121,6) - x2 = [1.36,1.14,2.92,2.55,1.46,1.06,5.27,-1.11, - 3.48,1.10,0.88,-0.51,1.46,0.52,6.20,1.69, - 0.08,3.67,2.81,3.49] - w,pw = stats.shapiro(x2) - assert_almost_equal(w,0.9590270,6) - assert_almost_equal(pw,0.52460,3) - - def test_bad_arg(self): - # Length of x is less than 3. 
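The deleted test_kde_bandwidth_method above covered behaviour that now comes from scipy directly; a condensed sketch of the same equivalence between the default, a callable, and a scalar bandwidth (scipy.stats.gaussian_kde assumed):

    import numpy as np
    from scipy import stats

    np.random.seed(8765678)
    xn = np.random.randn(50)

    def scotts_factor(kde_obj):
        # same rule as the default: n**(-1/(d+4))
        return np.power(kde_obj.n, -1.0 / (kde_obj.d + 4))

    gkde = stats.gaussian_kde(xn)                            # default (Scott)
    gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor)  # callable
    gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor)    # scalar
    xs = np.linspace(-3, 3, 11)
    assert np.allclose(gkde(xs), gkde2(xs))
    assert np.allclose(gkde(xs), gkde3(xs))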
- x = [1] - assert_raises(ValueError, stats.shapiro, x) - - -class TestAnderson(TestCase): - def test_normal(self): - rs = RandomState(1234567890) - x1 = rs.standard_exponential(size=50) - x2 = rs.standard_normal(size=50) - A,crit,sig = stats.anderson(x1) - assert_array_less(crit[:-1], A) - A,crit,sig = stats.anderson(x2) - assert_array_less(A, crit[-2:]) - - def test_expon(self): - rs = RandomState(1234567890) - x1 = rs.standard_exponential(size=50) - x2 = rs.standard_normal(size=50) - A,crit,sig = stats.anderson(x1,'expon') - assert_array_less(A, crit[-2:]) - olderr = np.seterr(all='ignore') - try: - A,crit,sig = stats.anderson(x2,'expon') - finally: - np.seterr(**olderr) - assert_(A > crit[-1]) - - def test_bad_arg(self): - assert_raises(ValueError, stats.anderson, [1], dist='plate_of_shrimp') - - -class TestAndersonKSamp(TestCase): - def test_example1a(self): - # Example data from Scholz & Stephens (1987), originally - # published in Lehmann (1995, Nonparametrics, Statistical - # Methods Based on Ranks, p. 309) - # Pass a mixture of lists and arrays - t1 = [38.7, 41.5, 43.8, 44.5, 45.5, 46.0, 47.7, 58.0] - t2 = np.array([39.2, 39.3, 39.7, 41.4, 41.8, 42.9, 43.3, 45.8]) - t3 = np.array([34.0, 35.0, 39.0, 40.0, 43.0, 43.0, 44.0, 45.0]) - t4 = np.array([34.0, 34.8, 34.8, 35.4, 37.2, 37.8, 41.2, 42.8]) - assert_warns(UserWarning, stats.anderson_ksamp, (t1, t2, t3, t4), - midrank=False) - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', message='approximate p-value') - Tk, tm, p = stats.anderson_ksamp((t1, t2, t3, t4), midrank=False) - - assert_almost_equal(Tk, 4.449, 3) - assert_array_almost_equal([0.4985, 1.3237, 1.9158, 2.4930, 3.2459], - tm, 4) - assert_almost_equal(p, 0.0021, 4) - - def test_example1b(self): - # Example data from Scholz & Stephens (1987), originally - # published in Lehmann (1995, Nonparametrics, Statistical - # Methods Based on Ranks, p. 
309) - # Pass arrays - t1 = np.array([38.7, 41.5, 43.8, 44.5, 45.5, 46.0, 47.7, 58.0]) - t2 = np.array([39.2, 39.3, 39.7, 41.4, 41.8, 42.9, 43.3, 45.8]) - t3 = np.array([34.0, 35.0, 39.0, 40.0, 43.0, 43.0, 44.0, 45.0]) - t4 = np.array([34.0, 34.8, 34.8, 35.4, 37.2, 37.8, 41.2, 42.8]) - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', message='approximate p-value') - Tk, tm, p = stats.anderson_ksamp((t1, t2, t3, t4), midrank=True) - - assert_almost_equal(Tk, 4.480, 3) - assert_array_almost_equal([0.4985, 1.3237, 1.9158, 2.4930, 3.2459], - tm, 4) - assert_almost_equal(p, 0.0020, 4) - - def test_example2a(self): - # Example data taken from an earlier technical report of - # Scholz and Stephens - # Pass lists instead of arrays - t1 = [194, 15, 41, 29, 33, 181] - t2 = [413, 14, 58, 37, 100, 65, 9, 169, 447, 184, 36, 201, 118] - t3 = [34, 31, 18, 18, 67, 57, 62, 7, 22, 34] - t4 = [90, 10, 60, 186, 61, 49, 14, 24, 56, 20, 79, 84, 44, 59, 29, - 118, 25, 156, 310, 76, 26, 44, 23, 62] - t5 = [130, 208, 70, 101, 208] - t6 = [74, 57, 48, 29, 502, 12, 70, 21, 29, 386, 59, 27] - t7 = [55, 320, 56, 104, 220, 239, 47, 246, 176, 182, 33] - t8 = [23, 261, 87, 7, 120, 14, 62, 47, 225, 71, 246, 21, 42, 20, 5, - 12, 120, 11, 3, 14, 71, 11, 14, 11, 16, 90, 1, 16, 52, 95] - t9 = [97, 51, 11, 4, 141, 18, 142, 68, 77, 80, 1, 16, 106, 206, 82, - 54, 31, 216, 46, 111, 39, 63, 18, 191, 18, 163, 24] - t10 = [50, 44, 102, 72, 22, 39, 3, 15, 197, 188, 79, 88, 46, 5, 5, 36, - 22, 139, 210, 97, 30, 23, 13, 14] - t11 = [359, 9, 12, 270, 603, 3, 104, 2, 438] - t12 = [50, 254, 5, 283, 35, 12] - t13 = [487, 18, 100, 7, 98, 5, 85, 91, 43, 230, 3, 130] - t14 = [102, 209, 14, 57, 54, 32, 67, 59, 134, 152, 27, 14, 230, 66, - 61, 34] - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', message='approximate p-value') - Tk, tm, p = stats.anderson_ksamp((t1, t2, t3, t4, t5, t6, t7, t8, - t9, t10, t11, t12, t13, t14), - midrank=False) - - assert_almost_equal(Tk, 3.288, 3) - assert_array_almost_equal([0.5990, 1.3269, 1.8052, 2.2486, 2.8009], - tm, 4) - assert_almost_equal(p, 0.0041, 4) - - def test_example2b(self): - # Example data taken from an earlier technical report of - # Scholz and Stephens - t1 = [194, 15, 41, 29, 33, 181] - t2 = [413, 14, 58, 37, 100, 65, 9, 169, 447, 184, 36, 201, 118] - t3 = [34, 31, 18, 18, 67, 57, 62, 7, 22, 34] - t4 = [90, 10, 60, 186, 61, 49, 14, 24, 56, 20, 79, 84, 44, 59, 29, - 118, 25, 156, 310, 76, 26, 44, 23, 62] - t5 = [130, 208, 70, 101, 208] - t6 = [74, 57, 48, 29, 502, 12, 70, 21, 29, 386, 59, 27] - t7 = [55, 320, 56, 104, 220, 239, 47, 246, 176, 182, 33] - t8 = [23, 261, 87, 7, 120, 14, 62, 47, 225, 71, 246, 21, 42, 20, 5, - 12, 120, 11, 3, 14, 71, 11, 14, 11, 16, 90, 1, 16, 52, 95] - t9 = [97, 51, 11, 4, 141, 18, 142, 68, 77, 80, 1, 16, 106, 206, 82, - 54, 31, 216, 46, 111, 39, 63, 18, 191, 18, 163, 24] - t10 = [50, 44, 102, 72, 22, 39, 3, 15, 197, 188, 79, 88, 46, 5, 5, 36, - 22, 139, 210, 97, 30, 23, 13, 14] - t11 = [359, 9, 12, 270, 603, 3, 104, 2, 438] - t12 = [50, 254, 5, 283, 35, 12] - t13 = [487, 18, 100, 7, 98, 5, 85, 91, 43, 230, 3, 130] - t14 = [102, 209, 14, 57, 54, 32, 67, 59, 134, 152, 27, 14, 230, 66, - 61, 34] - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', message='approximate p-value') - Tk, tm, p = stats.anderson_ksamp((t1, t2, t3, t4, t5, t6, t7, t8, - t9, t10, t11, t12, t13, t14), - midrank=True) - - assert_almost_equal(Tk, 3.294, 3) - assert_array_almost_equal([0.5990, 1.3269, 1.8052, 2.2486, 2.8009], - tm, 4) - 
assert_almost_equal(p, 0.0041, 4) - - def test_not_enough_samples(self): - assert_raises(ValueError, stats.anderson_ksamp, np.ones(5)) - - def test_no_distinct_observations(self): - assert_raises(ValueError, stats.anderson_ksamp, - (np.ones(5), np.ones(5))) - - def test_empty_sample(self): - assert_raises(ValueError, stats.anderson_ksamp, (np.ones(5), [])) - - -class TestAnsari(TestCase): - - def test_small(self): - x = [1,2,3,3,4] - y = [3,2,6,1,6,1,4,1] - W, pval = stats.ansari(x,y) - assert_almost_equal(W,23.5,11) - assert_almost_equal(pval,0.13499256881897437,11) - - def test_approx(self): - ramsay = np.array((111, 107, 100, 99, 102, 106, 109, 108, 104, 99, - 101, 96, 97, 102, 107, 113, 116, 113, 110, 98)) - parekh = np.array((107, 108, 106, 98, 105, 103, 110, 105, 104, - 100, 96, 108, 103, 104, 114, 114, 113, 108, 106, 99)) - - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', - message="Ties preclude use of exact statistic.") - W, pval = stats.ansari(ramsay, parekh) - - assert_almost_equal(W,185.5,11) - assert_almost_equal(pval,0.18145819972867083,11) - - def test_exact(self): - W,pval = stats.ansari([1,2,3,4],[15,5,20,8,10,12]) - assert_almost_equal(W,10.0,11) - assert_almost_equal(pval,0.533333333333333333,7) - - def test_bad_arg(self): - assert_raises(ValueError, stats.ansari, [], [1]) - assert_raises(ValueError, stats.ansari, [1], []) - - -class TestBartlett(TestCase): - - def test_data(self): - args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10] - T, pval = stats.bartlett(*args) - assert_almost_equal(T,20.78587342806484,7) - assert_almost_equal(pval,0.0136358632781,7) - - def test_bad_arg(self): - # Too few args raises ValueError. - assert_raises(ValueError, stats.bartlett, [1]) - - -class TestLevene(TestCase): - - def test_data(self): - args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10] - W, pval = stats.levene(*args) - assert_almost_equal(W,1.7059176930008939,7) - assert_almost_equal(pval,0.0990829755522,7) - - def test_trimmed1(self): - # Test that center='trimmed' gives the same result as center='mean' - # when proportiontocut=0. - W1, pval1 = stats.levene(g1, g2, g3, center='mean') - W2, pval2 = stats.levene(g1, g2, g3, center='trimmed', proportiontocut=0.0) - assert_almost_equal(W1, W2) - assert_almost_equal(pval1, pval2) - - def test_trimmed2(self): - x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0] - y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0] - np.random.seed(1234) - x2 = np.random.permutation(x) - - # Use center='trimmed' - W0, pval0 = stats.levene(x, y, center='trimmed', proportiontocut=0.125) - W1, pval1 = stats.levene(x2, y, center='trimmed', proportiontocut=0.125) - # Trim the data here, and use center='mean' - W2, pval2 = stats.levene(x[1:-1], y[1:-1], center='mean') - # Result should be the same. 
- assert_almost_equal(W0, W2) - assert_almost_equal(W1, W2) - assert_almost_equal(pval1, pval2) - - def test_equal_mean_median(self): - x = np.linspace(-1,1,21) - np.random.seed(1234) - x2 = np.random.permutation(x) - y = x**3 - W1, pval1 = stats.levene(x, y, center='mean') - W2, pval2 = stats.levene(x2, y, center='median') - assert_almost_equal(W1, W2) - assert_almost_equal(pval1, pval2) - - def test_bad_keyword(self): - x = np.linspace(-1,1,21) - assert_raises(TypeError, stats.levene, x, x, portiontocut=0.1) - - def test_bad_center_value(self): - x = np.linspace(-1,1,21) - assert_raises(ValueError, stats.levene, x, x, center='trim') - - def test_too_few_args(self): - assert_raises(ValueError, stats.levene, [1]) - - -class TestBinomP(TestCase): - - def test_data(self): - pval = stats.binom_test(100,250) - assert_almost_equal(pval,0.0018833009350757682,11) - pval = stats.binom_test(201,405) - assert_almost_equal(pval,0.92085205962670713,11) - pval = stats.binom_test([682,243],p=3.0/4) - assert_almost_equal(pval,0.38249155957481695,11) - - def test_bad_len_x(self): - # Length of x must be 1 or 2. - assert_raises(ValueError, stats.binom_test, [1,2,3]) - - def test_bad_n(self): - # len(x) is 1, but n is invalid. - # Missing n - assert_raises(ValueError, stats.binom_test, [100]) - # n less than x[0] - assert_raises(ValueError, stats.binom_test, [100], n=50) - - def test_bad_p(self): - assert_raises(ValueError, stats.binom_test, [50, 50], p=2.0) - - -class TestFindRepeats(TestCase): - - def test_basic(self): - a = [1,2,3,4,1,2,3,4,1,2,5] - res,nums = stats.find_repeats(a) - assert_array_equal(res,[1,2,3,4]) - assert_array_equal(nums,[3,3,2,2]) - - def test_empty_result(self): - # Check that empty arrays are returned when there are no repeats. - a = [10, 20, 50, 30, 40] - repeated, counts = stats.find_repeats(a) - assert_array_equal(repeated, []) - assert_array_equal(counts, []) - - -class TestFligner(TestCase): - - def test_data(self): - # numbers from R: fligner.test in package stats - x1 = np.arange(5) - assert_array_almost_equal(stats.fligner(x1,x1**2), - (3.2282229927203536, 0.072379187848207877), 11) - - def test_trimmed1(self): - # Test that center='trimmed' gives the same result as center='mean' - # when proportiontocut=0. - Xsq1, pval1 = stats.fligner(g1, g2, g3, center='mean') - Xsq2, pval2 = stats.fligner(g1, g2, g3, center='trimmed', proportiontocut=0.0) - assert_almost_equal(Xsq1, Xsq2) - assert_almost_equal(pval1, pval2) - - def test_trimmed2(self): - x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0] - y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0] - # Use center='trimmed' - Xsq1, pval1 = stats.fligner(x, y, center='trimmed', proportiontocut=0.125) - # Trim the data here, and use center='mean' - Xsq2, pval2 = stats.fligner(x[1:-1], y[1:-1], center='mean') - # Result should be the same. - assert_almost_equal(Xsq1, Xsq2) - assert_almost_equal(pval1, pval2) - - # The following test looks reasonable at first, but fligner() uses the - # function stats.rankdata(), and in one of the cases in this test, - # there are ties, while in the other (because of normal rounding - # errors) there are not. This difference leads to differences in the - # third significant digit of W. 
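The trimmed-center tests above rest on one identity: trimming nothing reduces center='trimmed' to center='mean'. A standalone sketch follows (scipy.stats assumed; the sample data here is illustrative, not from the tests):

    import numpy as np
    from scipy import stats

    rng = np.random.RandomState(0)
    a, b = rng.randn(20), 2.0 * rng.randn(20)
    w1, p1 = stats.levene(a, b, center='mean')
    w2, p2 = stats.levene(a, b, center='trimmed', proportiontocut=0.0)
    # with proportiontocut=0 the trimmed variant is the mean-centered test
    assert np.isclose(w1, w2) and np.isclose(p1, p2)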
- # - #def test_equal_mean_median(self): - # x = np.linspace(-1,1,21) - # y = x**3 - # W1, pval1 = stats.fligner(x, y, center='mean') - # W2, pval2 = stats.fligner(x, y, center='median') - # assert_almost_equal(W1, W2) - # assert_almost_equal(pval1, pval2) - - def test_bad_keyword(self): - x = np.linspace(-1,1,21) - assert_raises(TypeError, stats.fligner, x, x, portiontocut=0.1) - - def test_bad_center_value(self): - x = np.linspace(-1,1,21) - assert_raises(ValueError, stats.fligner, x, x, center='trim') - - def test_bad_num_args(self): - # Too few args raises ValueError. - assert_raises(ValueError, stats.fligner, [1]) - - -class TestMood(TestCase): - def test_mood(self): - # numbers from R: mood.test in package stats - x1 = np.arange(5) - assert_array_almost_equal(stats.mood(x1, x1**2), - (-1.3830857299399906, 0.16663858066771478), 11) - - def test_mood_order_of_args(self): - # z should change sign when the order of arguments changes, pvalue - # should not change - np.random.seed(1234) - x1 = np.random.randn(10, 1) - x2 = np.random.randn(15, 1) - z1, p1 = stats.mood(x1, x2) - z2, p2 = stats.mood(x2, x1) - assert_array_almost_equal([z1, p1], [-z2, p2]) - - def test_mood_with_axis_none(self): - #Test with axis = None, compare with results from R - x1 = [-0.626453810742332, 0.183643324222082, -0.835628612410047, - 1.59528080213779, 0.329507771815361, -0.820468384118015, - 0.487429052428485, 0.738324705129217, 0.575781351653492, - -0.305388387156356, 1.51178116845085, 0.389843236411431, - -0.621240580541804, -2.2146998871775, 1.12493091814311, - -0.0449336090152309, -0.0161902630989461, 0.943836210685299, - 0.821221195098089, 0.593901321217509] - - x2 = [-0.896914546624981, 0.184849184646742, 1.58784533120882, - -1.13037567424629, -0.0802517565509893, 0.132420284381094, - 0.707954729271733, -0.23969802417184, 1.98447393665293, - -0.138787012119665, 0.417650750792556, 0.981752777463662, - -0.392695355503813, -1.03966897694891, 1.78222896030858, - -2.31106908460517, 0.878604580921265, 0.035806718015226, - 1.01282869212708, 0.432265154539617, 2.09081920524915, - -1.19992581964387, 1.58963820029007, 1.95465164222325, - 0.00493777682814261, -2.45170638784613, 0.477237302613617, - -0.596558168631403, 0.792203270299649, 0.289636710177348] - - x1 = np.array(x1) - x2 = np.array(x2) - x1.shape = (10, 2) - x2.shape = (15, 2) - assert_array_almost_equal(stats.mood(x1, x2, axis=None), - [-1.31716607555, 0.18778296257]) - - def test_mood_2d(self): - # Test if the results of mood test in 2-D case are consistent with the - # R result for the same inputs. Numbers from R mood.test(). 
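test_mood_order_of_args above encodes an antisymmetry of the statistic; here is the compact version, with the test's seed but 1-D samples for brevity (scipy.stats assumed):

    import numpy as np
    from scipy import stats

    np.random.seed(1234)
    x1, x2 = np.random.randn(10), np.random.randn(15)
    z1, p1 = stats.mood(x1, x2)
    z2, p2 = stats.mood(x2, x1)
    # swapping the samples flips the sign of z but leaves the p-value alone
    assert np.allclose([z1, p1], [-z2, p2])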
- ny = 5 - np.random.seed(1234) - x1 = np.random.randn(10, ny) - x2 = np.random.randn(15, ny) - z_vectest, pval_vectest = stats.mood(x1, x2) - - for j in range(ny): - assert_array_almost_equal([z_vectest[j], pval_vectest[j]], - stats.mood(x1[:, j], x2[:, j])) - - # inverse order of dimensions - x1 = x1.transpose() - x2 = x2.transpose() - z_vectest, pval_vectest = stats.mood(x1, x2, axis=1) - - for i in range(ny): - # check axis handling is self consistent - assert_array_almost_equal([z_vectest[i], pval_vectest[i]], - stats.mood(x1[i, :], x2[i, :])) - - def test_mood_3d(self): - shape = (10, 5, 6) - np.random.seed(1234) - x1 = np.random.randn(*shape) - x2 = np.random.randn(*shape) - - for axis in range(3): - z_vectest, pval_vectest = stats.mood(x1, x2, axis=axis) - # Tests that result for 3-D arrays is equal to that for the - # same calculation on a set of 1-D arrays taken from the - # 3-D array - axes_idx = ([1, 2], [0, 2], [0, 1]) # the two axes != axis - for i in range(shape[axes_idx[axis][0]]): - for j in range(shape[axes_idx[axis][1]]): - if axis == 0: - slice1 = x1[:, i, j] - slice2 = x2[:, i, j] - elif axis == 1: - slice1 = x1[i, :, j] - slice2 = x2[i, :, j] - else: - slice1 = x1[i, j, :] - slice2 = x2[i, j, :] - - assert_array_almost_equal([z_vectest[i, j], - pval_vectest[i, j]], - stats.mood(slice1, slice2)) - - def test_mood_bad_arg(self): - # Raise ValueError when the sum of the lengths of the args is less than 3 - assert_raises(ValueError, stats.mood, [1], []) - - -class TestProbplot(TestCase): - - def test_basic(self): - np.random.seed(12345) - x = stats.norm.rvs(size=20) - osm, osr = stats.probplot(x, fit=False) - osm_expected = [-1.8241636, -1.38768012, -1.11829229, -0.91222575, - -0.73908135, -0.5857176, -0.44506467, -0.31273668, - -0.18568928, -0.06158146, 0.06158146, 0.18568928, - 0.31273668, 0.44506467, 0.5857176, 0.73908135, - 0.91222575, 1.11829229, 1.38768012, 1.8241636] - assert_allclose(osr, np.sort(x)) - assert_allclose(osm, osm_expected) - - res, res_fit = stats.probplot(x, fit=True) - res_fit_expected = [1.05361841, 0.31297795, 0.98741609] - assert_allclose(res_fit, res_fit_expected) - - def test_sparams_keyword(self): - np.random.seed(123456) - x = stats.norm.rvs(size=100) - # Check that None, () and 0 (loc=0, for normal distribution) all work - # and give the same results - osm1, osr1 = stats.probplot(x, sparams=None, fit=False) - osm2, osr2 = stats.probplot(x, sparams=0, fit=False) - osm3, osr3 = stats.probplot(x, sparams=(), fit=False) - assert_allclose(osm1, osm2) - assert_allclose(osm1, osm3) - assert_allclose(osr1, osr2) - assert_allclose(osr1, osr3) - # Check giving (loc, scale) params for normal distribution - osm, osr = stats.probplot(x, sparams=(), fit=False) - - def test_dist_keyword(self): - np.random.seed(12345) - x = stats.norm.rvs(size=20) - osm1, osr1 = stats.probplot(x, fit=False, dist='t', sparams=(3,)) - osm2, osr2 = stats.probplot(x, fit=False, dist=stats.t, sparams=(3,)) - assert_allclose(osm1, osm2) - assert_allclose(osr1, osr2) - - assert_raises(ValueError, stats.probplot, x, dist='wrong-dist-name') - assert_raises(AttributeError, stats.probplot, x, dist=[]) - - class custom_dist(object): - """Some class that looks just enough like a distribution.""" - def ppf(self, q): - return stats.norm.ppf(q, loc=2) - - osm1, osr1 = stats.probplot(x, sparams=(2,), fit=False) - osm2, osr2 = stats.probplot(x, dist=custom_dist(), fit=False) - assert_allclose(osm1, osm2) - assert_allclose(osr1, osr2) - - @dec.skipif(not have_matplotlib) - def 
test_plot_kwarg(self): - np.random.seed(7654321) - fig = plt.figure() - fig.add_subplot(111) - x = stats.t.rvs(3, size=100) - res1, fitres1 = stats.probplot(x, plot=plt) - plt.close() - res2, fitres2 = stats.probplot(x, plot=None) - res3 = stats.probplot(x, fit=False, plot=plt) - plt.close() - res4 = stats.probplot(x, fit=False, plot=None) - # Check that results are consistent between combinations of `fit` and - # `plot` keywords. - assert_(len(res1) == len(res2) == len(res3) == len(res4) == 2) - assert_allclose(res1, res2) - assert_allclose(res1, res3) - assert_allclose(res1, res4) - assert_allclose(fitres1, fitres2) - - # Check that a Matplotlib Axes object is accepted - fig = plt.figure() - ax = fig.add_subplot(111) - stats.probplot(x, fit=False, plot=ax) - plt.close() - - def test_probplot_bad_args(self): - # Raise ValueError when given an invalid distribution. - assert_raises(ValueError, stats.probplot, [1], dist="plate_of_shrimp") - - -def test_wilcoxon_bad_arg(): - # Raise ValueError when two args of different lengths are given or - # zero_method is unknown. - assert_raises(ValueError, stats.wilcoxon, [1], [1,2]) - assert_raises(ValueError, stats.wilcoxon, [1,2], [1,2], "dummy") - - -def test_mvsdist_bad_arg(): - # Raise ValueError if fewer than two data points are given. - data = [1] - assert_raises(ValueError, stats.mvsdist, data) - - -def test_kstat_bad_arg(): - # Raise ValueError if n > 4 or n > 1. - data = [1] - n = 10 - assert_raises(ValueError, stats.kstat, data, n=n) - - -def test_kstatvar_bad_arg(): - # Raise ValueError is n is not 1 or 2. - data = [1] - n = 10 - assert_raises(ValueError, stats.kstatvar, data, n=n) - - -def test_ppcc_max_bad_arg(): - # Raise ValueError when given an invalid distribution. - data = [1] - assert_raises(ValueError, stats.ppcc_max, data, dist="plate_of_shrimp") - - -class TestBoxcox_llf(TestCase): - - def test_basic(self): - np.random.seed(54321) - x = stats.norm.rvs(size=10000, loc=10) - lmbda = 1 - llf = stats.boxcox_llf(lmbda, x) - llf_expected = -x.size / 2. * np.log(np.sum(x.std()**2)) - assert_allclose(llf, llf_expected) - - def test_array_like(self): - np.random.seed(54321) - x = stats.norm.rvs(size=100, loc=10) - lmbda = 1 - llf = stats.boxcox_llf(lmbda, x) - llf2 = stats.boxcox_llf(lmbda, list(x)) - assert_allclose(llf, llf2, rtol=1e-12) - - def test_2d_input(self): - # Note: boxcox_llf() was already working with 2-D input (sort of), so - # keep it like that. boxcox() doesn't work with 2-D input though, due - # to brent() returning a scalar. - np.random.seed(54321) - x = stats.norm.rvs(size=100, loc=10) - lmbda = 1 - llf = stats.boxcox_llf(lmbda, x) - llf2 = stats.boxcox_llf(lmbda, np.vstack([x, x]).T) - assert_allclose([llf, llf], llf2, rtol=1e-12) - - def test_empty(self): - assert_(np.isnan(stats.boxcox_llf(1, []))) - - -class TestBoxcox(TestCase): - - def test_fixed_lmbda(self): - np.random.seed(12345) - x = stats.loggamma.rvs(5, size=50) + 5 - xt = stats.boxcox(x, lmbda=1) - assert_allclose(xt, x - 1) - xt = stats.boxcox(x, lmbda=-1) - assert_allclose(xt, 1 - 1/x) - - xt = stats.boxcox(x, lmbda=0) - assert_allclose(xt, np.log(x)) - - # Also test that array_like input works - xt = stats.boxcox(list(x), lmbda=0) - assert_allclose(xt, np.log(x)) - - def test_lmbda_None(self): - np.random.seed(1234567) - # Start from normal rv's, do inverse transform to check that - # optimization function gets close to the right answer. 
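test_fixed_lmbda above checks the special cases of the Box-Cox transform (x**lmbda - 1) / lmbda; they can be verified directly (scipy.stats assumed, the grid of x values is illustrative):

    import numpy as np
    from scipy import stats

    x = np.linspace(1.0, 5.0, 20)
    assert np.allclose(stats.boxcox(x, lmbda=0), np.log(x))     # limiting log case
    assert np.allclose(stats.boxcox(x, lmbda=1), x - 1)         # pure shift
    assert np.allclose(stats.boxcox(x, lmbda=-1), 1 - 1.0 / x)  # reciprocal form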
- np.random.seed(1245) - lmbda = 2.5 - x = stats.norm.rvs(loc=10, size=50000) - x_inv = (x * lmbda + 1)**(-lmbda) - xt, maxlog = stats.boxcox(x_inv) - - assert_almost_equal(maxlog, -1 / lmbda, decimal=2) - - def test_alpha(self): - np.random.seed(1234) - x = stats.loggamma.rvs(5, size=50) + 5 - - # Some regular values for alpha, on a small sample size - _, _, interval = stats.boxcox(x, alpha=0.75) - assert_allclose(interval, [4.004485780226041, 5.138756355035744]) - _, _, interval = stats.boxcox(x, alpha=0.05) - assert_allclose(interval, [1.2138178554857557, 8.209033272375663]) - - # Try some extreme values, see we don't hit the N=500 limit - x = stats.loggamma.rvs(7, size=500) + 15 - _, _, interval = stats.boxcox(x, alpha=0.001) - assert_allclose(interval, [0.3988867, 11.40553131]) - _, _, interval = stats.boxcox(x, alpha=0.999) - assert_allclose(interval, [5.83316246, 5.83735292]) - - def test_boxcox_bad_arg(self): - # Raise ValueError if any data value is negative. - x = np.array([-1]) - assert_raises(ValueError, stats.boxcox, x) - - def test_empty(self): - assert_(stats.boxcox([]).shape == (0,)) - - -class TestBoxcoxNormmax(TestCase): - def setUp(self): - np.random.seed(12345) - self.x = stats.loggamma.rvs(5, size=50) + 5 - - def test_pearsonr(self): - maxlog = stats.boxcox_normmax(self.x) - assert_allclose(maxlog, 1.804465, rtol=1e-6) - - def test_mle(self): - maxlog = stats.boxcox_normmax(self.x, method='mle') - assert_allclose(maxlog, 1.758101, rtol=1e-6) - - # Check that boxcox() uses 'mle' - _, maxlog_boxcox = stats.boxcox(self.x) - assert_allclose(maxlog_boxcox, maxlog) - - def test_all(self): - maxlog_all = stats.boxcox_normmax(self.x, method='all') - assert_allclose(maxlog_all, [1.804465, 1.758101], rtol=1e-6) - - -class TestBoxcoxNormplot(TestCase): - def setUp(self): - np.random.seed(7654321) - self.x = stats.loggamma.rvs(5, size=500) + 5 - - def test_basic(self): - N = 5 - lmbdas, ppcc = stats.boxcox_normplot(self.x, -10, 10, N=N) - ppcc_expected = [0.57783375, 0.83610988, 0.97524311, 0.99756057, - 0.95843297] - assert_allclose(lmbdas, np.linspace(-10, 10, num=N)) - assert_allclose(ppcc, ppcc_expected) - - @dec.skipif(not have_matplotlib) - def test_plot_kwarg(self): - # Check with the matplotlib.pyplot module - fig = plt.figure() - fig.add_subplot(111) - stats.boxcox_normplot(self.x, -20, 20, plot=plt) - plt.close() - - # Check that a Matplotlib Axes object is accepted - fig.add_subplot(111) - ax = fig.add_subplot(111) - stats.boxcox_normplot(self.x, -20, 20, plot=ax) - plt.close() - - def test_invalid_inputs(self): - # `lb` has to be larger than `la` - assert_raises(ValueError, stats.boxcox_normplot, self.x, 1, 0) - # `x` can not contain negative values - assert_raises(ValueError, stats.boxcox_normplot, [-1, 1], 0, 1) - - def test_empty(self): - assert_(stats.boxcox_normplot([], 0, 1).size == 0) - - -class TestCircFuncs(TestCase): - def test_circfuncs(self): - x = np.array([355,5,2,359,10,350]) - M = stats.circmean(x, high=360) - Mval = 0.167690146 - assert_allclose(M, Mval, rtol=1e-7) - - V = stats.circvar(x, high=360) - Vval = 42.51955609 - assert_allclose(V, Vval, rtol=1e-7) - - S = stats.circstd(x, high=360) - Sval = 6.520702116 - assert_allclose(S, Sval, rtol=1e-7) - - def test_circfuncs_small(self): - x = np.array([20,21,22,18,19,20.5,19.2]) - M1 = x.mean() - M2 = stats.circmean(x, high=360) - assert_allclose(M2, M1, rtol=1e-5) - - V1 = x.var() - V2 = stats.circvar(x, high=360) - assert_allclose(V2, V1, rtol=1e-4) - - S1 = x.std() - S2 = stats.circstd(x, high=360) - 
assert_allclose(S2, S1, rtol=1e-4) - - def test_circmean_axis(self): - x = np.array([[355,5,2,359,10,350], - [351,7,4,352,9,349], - [357,9,8,358,4,356]]) - M1 = stats.circmean(x, high=360) - M2 = stats.circmean(x.ravel(), high=360) - assert_allclose(M1, M2, rtol=1e-14) - - M1 = stats.circmean(x, high=360, axis=1) - M2 = [stats.circmean(x[i], high=360) for i in range(x.shape[0])] - assert_allclose(M1, M2, rtol=1e-14) - - M1 = stats.circmean(x, high=360, axis=0) - M2 = [stats.circmean(x[:,i], high=360) for i in range(x.shape[1])] - assert_allclose(M1, M2, rtol=1e-14) - - def test_circvar_axis(self): - x = np.array([[355,5,2,359,10,350], - [351,7,4,352,9,349], - [357,9,8,358,4,356]]) - - V1 = stats.circvar(x, high=360) - V2 = stats.circvar(x.ravel(), high=360) - assert_allclose(V1, V2, rtol=1e-11) - - V1 = stats.circvar(x, high=360, axis=1) - V2 = [stats.circvar(x[i], high=360) for i in range(x.shape[0])] - assert_allclose(V1, V2, rtol=1e-11) - - V1 = stats.circvar(x, high=360, axis=0) - V2 = [stats.circvar(x[:,i], high=360) for i in range(x.shape[1])] - assert_allclose(V1, V2, rtol=1e-11) - - def test_circstd_axis(self): - x = np.array([[355,5,2,359,10,350], - [351,7,4,352,9,349], - [357,9,8,358,4,356]]) - - S1 = stats.circstd(x, high=360) - S2 = stats.circstd(x.ravel(), high=360) - assert_allclose(S1, S2, rtol=1e-11) - - S1 = stats.circstd(x, high=360, axis=1) - S2 = [stats.circstd(x[i], high=360) for i in range(x.shape[0])] - assert_allclose(S1, S2, rtol=1e-11) - - S1 = stats.circstd(x, high=360, axis=0) - S2 = [stats.circstd(x[:,i], high=360) for i in range(x.shape[1])] - assert_allclose(S1, S2, rtol=1e-11) - - def test_circfuncs_array_like(self): - x = [355,5,2,359,10,350] - assert_allclose(stats.circmean(x, high=360), 0.167690146, rtol=1e-7) - assert_allclose(stats.circvar(x, high=360), 42.51955609, rtol=1e-7) - assert_allclose(stats.circstd(x, high=360), 6.520702116, rtol=1e-7) - - def test_empty(self): - assert_(np.isnan(stats.circmean([]))) - assert_(np.isnan(stats.circstd([]))) - assert_(np.isnan(stats.circvar([]))) - - -def test_accuracy_wilcoxon(): - freq = [1, 4, 16, 15, 8, 4, 5, 1, 2] - nums = range(-4, 5) - x = np.concatenate([[u] * v for u, v in zip(nums, freq)]) - y = np.zeros(x.size) - - T, p = stats.wilcoxon(x, y, "pratt") - assert_allclose(T, 423) - assert_allclose(p, 0.00197547303533107) - - T, p = stats.wilcoxon(x, y, "zsplit") - assert_allclose(T, 441) - assert_allclose(p, 0.0032145343172473055) - - T, p = stats.wilcoxon(x, y, "wilcox") - assert_allclose(T, 327) - assert_allclose(p, 0.00641346115861) - - # Test the 'correction' option, using values computed in R with: - # > wilcox.test(x, y, paired=TRUE, exact=FALSE, correct={FALSE,TRUE}) - x = np.array([120, 114, 181, 188, 180, 146, 121, 191, 132, 113, 127, 112]) - y = np.array([133, 143, 119, 189, 112, 199, 198, 113, 115, 121, 142, 187]) - T, p = stats.wilcoxon(x, y, correction=False) - assert_equal(T, 34) - assert_allclose(p, 0.6948866, rtol=1e-6) - T, p = stats.wilcoxon(x, y, correction=True) - assert_equal(T, 34) - assert_allclose(p, 0.7240817, rtol=1e-6) - - -def test_wilcoxon_tie(): - # Regression test for gh-2391. 
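The circular-statistics tests above hinge on the wrap-around at high=360; the contrast with the arithmetic mean is easy to see in isolation (data and expected value taken from test_circfuncs, scipy.stats assumed):

    import numpy as np
    from scipy import stats

    x = np.array([355, 5, 2, 359, 10, 350])
    # the arithmetic mean lands near 180 even though all angles hug 0/360
    assert abs(np.mean(x) - 180.0) < 1.0
    assert np.isclose(stats.circmean(x, high=360), 0.167690146, rtol=1e-7)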
- # Corresponding R code is: - # > result = wilcox.test(rep(0.1, 10), exact=FALSE, correct=FALSE) - # > result$p.value - # [1] 0.001565402 - # > result = wilcox.test(rep(0.1, 10), exact=FALSE, correct=TRUE) - # > result$p.value - # [1] 0.001904195 - stat, p = stats.wilcoxon([0.1] * 10) - expected_p = 0.001565402 - assert_equal(stat, 0) - assert_allclose(p, expected_p, rtol=1e-6) - - stat, p = stats.wilcoxon([0.1] * 10, correction=True) - expected_p = 0.001904195 - assert_equal(stat, 0) - assert_allclose(p, expected_p, rtol=1e-6) - - -class TestMedianTest(TestCase): - - def test_bad_n_samples(self): - # median_test requires at least two samples. - assert_raises(ValueError, stats.median_test, [1, 2, 3]) - - def test_empty_sample(self): - # Each sample must contain at least one value. - assert_raises(ValueError, stats.median_test, [], [1, 2, 3]) - - def test_empty_when_ties_ignored(self): - # The grand median is 1, and all values in the first argument are - # equal to the grand median. With ties="ignore", those values are - # ignored, which results in the first sample being (in effect) empty. - # This should raise a ValueError. - assert_raises(ValueError, stats.median_test, - [1, 1, 1, 1], [2, 0, 1], [2, 0], ties="ignore") - - def test_empty_contingency_row(self): - # The grand median is 1, and with the default ties="below", all the - # values in the samples are counted as being below the grand median. - # This would result a row of zeros in the contingency table, which is - # an error. - assert_raises(ValueError, stats.median_test, [1, 1, 1], [1, 1, 1]) - - # With ties="above", all the values are counted as above the - # grand median. - assert_raises(ValueError, stats.median_test, [1, 1, 1], [1, 1, 1], - ties="above") - - def test_bad_ties(self): - assert_raises(ValueError, stats.median_test, [1, 2, 3], [4, 5], ties="foo") - - def test_bad_keyword(self): - assert_raises(TypeError, stats.median_test, [1, 2, 3], [4, 5], foo="foo") - - def test_simple(self): - x = [1, 2, 3] - y = [1, 2, 3] - stat, p, med, tbl = stats.median_test(x, y) - - # The median is floating point, but this equality test should be safe. - assert_equal(med, 2.0) - - assert_array_equal(tbl, [[1, 1], [2, 2]]) - - # The expected values of the contingency table equal the contingency table, - # so the statistic should be 0 and the p-value should be 1. - assert_equal(stat, 0) - assert_equal(p, 1) - - def test_ties_options(self): - # Test the contingency table calculation. - x = [1, 2, 3, 4] - y = [5, 6] - z = [7, 8, 9] - # grand median is 5. - - # Default 'ties' option is "below". - stat, p, m, tbl = stats.median_test(x, y, z) - assert_equal(m, 5) - assert_equal(tbl, [[0, 1, 3], [4, 1, 0]]) - - stat, p, m, tbl = stats.median_test(x, y, z, ties="ignore") - assert_equal(m, 5) - assert_equal(tbl, [[0, 1, 3], [4, 0, 0]]) - - stat, p, m, tbl = stats.median_test(x, y, z, ties="above") - assert_equal(m, 5) - assert_equal(tbl, [[0, 2, 3], [4, 0, 0]]) - - def test_basic(self): - # median_test calls chi2_contingency to compute the test statistic - # and p-value. Make sure it hasn't screwed up the call... 
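TestMedianTest.test_basic above verifies that median_test defers to chi2_contingency; the delegation can be checked with the same tiny samples (scipy.stats assumed):

    import numpy as np
    from scipy import stats

    x = [1, 2, 3, 4, 5]
    y = [2, 4, 6, 8]
    stat, p, med, tbl = stats.median_test(x, y)
    # the statistic and p-value come straight from the contingency table
    exp_stat, exp_p, dof, expected = stats.chi2_contingency(tbl)
    assert np.isclose(stat, exp_stat) and np.isclose(p, exp_p)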
- - x = [1, 2, 3, 4, 5] - y = [2, 4, 6, 8] - - stat, p, m, tbl = stats.median_test(x, y) - assert_equal(m, 4) - assert_equal(tbl, [[1, 2], [4, 2]]) - - exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl) - assert_allclose(stat, exp_stat) - assert_allclose(p, exp_p) - - stat, p, m, tbl = stats.median_test(x, y, lambda_=0) - assert_equal(m, 4) - assert_equal(tbl, [[1, 2], [4, 2]]) - - exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, lambda_=0) - assert_allclose(stat, exp_stat) - assert_allclose(p, exp_p) - - stat, p, m, tbl = stats.median_test(x, y, correction=False) - assert_equal(m, 4) - assert_equal(tbl, [[1, 2], [4, 2]]) - - exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, correction=False) - assert_allclose(stat, exp_stat) - assert_allclose(p, exp_p) - - -if __name__ == "__main__": - run_module_suite() diff --git a/wafo/stats/tests/test_mstats_basic.py b/wafo/stats/tests/test_mstats_basic.py deleted file mode 100644 index 1f242e4..0000000 --- a/wafo/stats/tests/test_mstats_basic.py +++ /dev/null @@ -1,1055 +0,0 @@ -""" -Tests for the stats.mstats module (support for masked arrays) -""" -from __future__ import division, print_function, absolute_import - -import warnings - -import numpy as np -from numpy import nan -import numpy.ma as ma -from numpy.ma import masked, nomask - -import wafo.stats.mstats as mstats -from wafo import stats -from numpy.testing import TestCase, run_module_suite -from numpy.testing.decorators import skipif -from numpy.ma.testutils import (assert_equal, assert_almost_equal, - assert_array_almost_equal, assert_array_almost_equal_nulp, assert_, - assert_allclose, assert_raises) - - -class TestMquantiles(TestCase): - def test_mquantiles_limit_keyword(self): - # Regression test for Trac ticket #867 - data = np.array([[6., 7., 1.], - [47., 15., 2.], - [49., 36., 3.], - [15., 39., 4.], - [42., 40., -999.], - [41., 41., -999.], - [7., -999., -999.], - [39., -999., -999.], - [43., -999., -999.], - [40., -999., -999.], - [36., -999., -999.]]) - desired = [[19.2, 14.6, 1.45], - [40.0, 37.5, 2.5], - [42.8, 40.05, 3.55]] - quants = mstats.mquantiles(data, axis=0, limit=(0, 50)) - assert_almost_equal(quants, desired) - - -class TestGMean(TestCase): - def test_1D(self): - a = (1,2,3,4) - actual = mstats.gmean(a) - desired = np.power(1*2*3*4,1./4.) - assert_almost_equal(actual, desired, decimal=14) - - desired1 = mstats.gmean(a,axis=-1) - assert_almost_equal(actual, desired1, decimal=14) - assert_(not isinstance(desired1, ma.MaskedArray)) - - a = ma.array((1,2,3,4),mask=(0,0,0,1)) - actual = mstats.gmean(a) - desired = np.power(1*2*3,1./3.) - assert_almost_equal(actual, desired,decimal=14) - - desired1 = mstats.gmean(a,axis=-1) - assert_almost_equal(actual, desired1, decimal=14) - - @skipif(not hasattr(np, 'float96'), 'cannot find float96 so skipping') - def test_1D_float96(self): - a = ma.array((1,2,3,4), mask=(0,0,0,1)) - actual_dt = mstats.gmean(a, dtype=np.float96) - desired_dt = np.power(1 * 2 * 3, 1. 
/ 3.).astype(np.float96) - assert_almost_equal(actual_dt, desired_dt, decimal=14) - assert_(actual_dt.dtype == desired_dt.dtype) - - def test_2D(self): - a = ma.array(((1, 2, 3, 4), (1, 2, 3, 4), (1, 2, 3, 4)), - mask=((0, 0, 0, 0), (1, 0, 0, 1), (0, 1, 1, 0))) - actual = mstats.gmean(a) - desired = np.array((1,2,3,4)) - assert_array_almost_equal(actual, desired, decimal=14) - - desired1 = mstats.gmean(a,axis=0) - assert_array_almost_equal(actual, desired1, decimal=14) - - actual = mstats.gmean(a, -1) - desired = ma.array((np.power(1*2*3*4,1./4.), - np.power(2*3,1./2.), - np.power(1*4,1./2.))) - assert_array_almost_equal(actual, desired, decimal=14) - - -class TestHMean(TestCase): - def test_1D(self): - a = (1,2,3,4) - actual = mstats.hmean(a) - desired = 4. / (1./1 + 1./2 + 1./3 + 1./4) - assert_almost_equal(actual, desired, decimal=14) - desired1 = mstats.hmean(ma.array(a),axis=-1) - assert_almost_equal(actual, desired1, decimal=14) - - a = ma.array((1,2,3,4),mask=(0,0,0,1)) - actual = mstats.hmean(a) - desired = 3. / (1./1 + 1./2 + 1./3) - assert_almost_equal(actual, desired,decimal=14) - desired1 = mstats.hmean(a,axis=-1) - assert_almost_equal(actual, desired1, decimal=14) - - @skipif(not hasattr(np, 'float96'), 'cannot find float96 so skipping') - def test_1D_float96(self): - a = ma.array((1,2,3,4), mask=(0,0,0,1)) - actual_dt = mstats.hmean(a, dtype=np.float96) - desired_dt = np.asarray(3. / (1./1 + 1./2 + 1./3), - dtype=np.float96) - assert_almost_equal(actual_dt, desired_dt, decimal=14) - assert_(actual_dt.dtype == desired_dt.dtype) - - def test_2D(self): - a = ma.array(((1,2,3,4),(1,2,3,4),(1,2,3,4)), - mask=((0,0,0,0),(1,0,0,1),(0,1,1,0))) - actual = mstats.hmean(a) - desired = ma.array((1,2,3,4)) - assert_array_almost_equal(actual, desired, decimal=14) - - actual1 = mstats.hmean(a,axis=-1) - desired = (4./(1/1.+1/2.+1/3.+1/4.), - 2./(1/2.+1/3.), - 2./(1/1.+1/4.) - ) - assert_array_almost_equal(actual1, desired, decimal=14) - - -class TestRanking(TestCase): - def __init__(self, *args, **kwargs): - TestCase.__init__(self, *args, **kwargs) - - def test_ranking(self): - x = ma.array([0,1,1,1,2,3,4,5,5,6,]) - assert_almost_equal(mstats.rankdata(x), - [1,3,3,3,5,6,7,8.5,8.5,10]) - x[[3,4]] = masked - assert_almost_equal(mstats.rankdata(x), - [1,2.5,2.5,0,0,4,5,6.5,6.5,8]) - assert_almost_equal(mstats.rankdata(x, use_missing=True), - [1,2.5,2.5,4.5,4.5,4,5,6.5,6.5,8]) - x = ma.array([0,1,5,1,2,4,3,5,1,6,]) - assert_almost_equal(mstats.rankdata(x), - [1,3,8.5,3,5,7,6,8.5,3,10]) - x = ma.array([[0,1,1,1,2], [3,4,5,5,6,]]) - assert_almost_equal(mstats.rankdata(x), - [[1,3,3,3,5], [6,7,8.5,8.5,10]]) - assert_almost_equal(mstats.rankdata(x, axis=1), - [[1,3,3,3,5], [1,2,3.5,3.5,5]]) - assert_almost_equal(mstats.rankdata(x,axis=0), - [[1,1,1,1,1], [2,2,2,2,2,]]) - - -class TestCorr(TestCase): - def test_pearsonr(self): - # Tests some computations of Pearson's r - x = ma.arange(10) - with warnings.catch_warnings(): - # The tests in this context are edge cases, with perfect - # correlation or anticorrelation, or totally masked data. - # None of these should trigger a RuntimeWarning. 
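The masked-array mean tests above now live in scipy; the core behaviour, masked entries dropping out of the geometric mean, reduces to a two-liner (scipy.stats.mstats assumed):

    import numpy as np
    import numpy.ma as ma
    from scipy.stats import mstats

    a = ma.array((1, 2, 3, 4), mask=(0, 0, 0, 1))  # the 4 is masked out
    assert np.isclose(mstats.gmean(a), (1 * 2 * 3) ** (1.0 / 3.0))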
- warnings.simplefilter("error", RuntimeWarning) - - assert_almost_equal(mstats.pearsonr(x, x)[0], 1.0) - assert_almost_equal(mstats.pearsonr(x, x[::-1])[0], -1.0) - - x = ma.array(x, mask=True) - pr = mstats.pearsonr(x, x) - assert_(pr[0] is masked) - assert_(pr[1] is masked) - - x1 = ma.array([-1.0, 0.0, 1.0]) - y1 = ma.array([0, 0, 3]) - r, p = mstats.pearsonr(x1, y1) - assert_almost_equal(r, np.sqrt(3)/2) - assert_almost_equal(p, 1.0/3) - - # (x2, y2) have the same unmasked data as (x1, y1). - mask = [False, False, False, True] - x2 = ma.array([-1.0, 0.0, 1.0, 99.0], mask=mask) - y2 = ma.array([0, 0, 3, -1], mask=mask) - r, p = mstats.pearsonr(x2, y2) - assert_almost_equal(r, np.sqrt(3)/2) - assert_almost_equal(p, 1.0/3) - - def test_spearmanr(self): - # Tests some computations of Spearman's rho - (x, y) = ([5.05,6.75,3.21,2.66],[1.65,2.64,2.64,6.95]) - assert_almost_equal(mstats.spearmanr(x,y)[0], -0.6324555) - (x, y) = ([5.05,6.75,3.21,2.66,np.nan],[1.65,2.64,2.64,6.95,np.nan]) - (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y)) - assert_almost_equal(mstats.spearmanr(x,y)[0], -0.6324555) - - x = [2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, - 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7] - y = [22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, - 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4] - assert_almost_equal(mstats.spearmanr(x,y)[0], 0.6887299) - x = [2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, - 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7, np.nan] - y = [22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, - 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4, np.nan] - (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y)) - assert_almost_equal(mstats.spearmanr(x,y)[0], 0.6887299) - - def test_kendalltau(self): - # Tests some computations of Kendall's tau - x = ma.fix_invalid([5.05, 6.75, 3.21, 2.66,np.nan]) - y = ma.fix_invalid([1.65, 26.5, -5.93, 7.96, np.nan]) - z = ma.fix_invalid([1.65, 2.64, 2.64, 6.95, np.nan]) - assert_almost_equal(np.asarray(mstats.kendalltau(x,y)), - [+0.3333333,0.4969059]) - assert_almost_equal(np.asarray(mstats.kendalltau(x,z)), - [-0.5477226,0.2785987]) - # - x = ma.fix_invalid([0, 0, 0, 0,20,20, 0,60, 0,20, - 10,10, 0,40, 0,20, 0, 0, 0, 0, 0, np.nan]) - y = ma.fix_invalid([0,80,80,80,10,33,60, 0,67,27, - 25,80,80,80,80,80,80, 0,10,45, np.nan, 0]) - result = mstats.kendalltau(x,y) - assert_almost_equal(np.asarray(result), [-0.1585188, 0.4128009]) - - def test_kendalltau_seasonal(self): - # Tests the seasonal Kendall tau. 
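test_spearmanr above shows the mstats convention for missing data: mask the NaNs and the masked pairs drop out. A standalone sketch with the test's own values (scipy.stats.mstats assumed):

    import numpy as np
    import numpy.ma as ma
    from scipy.stats import mstats

    x = ma.fix_invalid([5.05, 6.75, 3.21, 2.66, np.nan])
    y = ma.fix_invalid([1.65, 2.64, 2.64, 6.95, np.nan])
    # the masked fifth pair is ignored, matching the 4-point result
    assert np.isclose(mstats.spearmanr(x, y)[0], -0.6324555, atol=1e-6)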
- x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1], - [4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3], - [3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan], - [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]] - x = ma.fix_invalid(x).T - output = mstats.kendalltau_seasonal(x) - assert_almost_equal(output['global p-value (indep)'], 0.008, 3) - assert_almost_equal(output['seasonal p-value'].round(2), - [0.18,0.53,0.20,0.04]) - - def test_pointbiserial(self): - x = [1,0,1,1,1,1,0,1,0,0,0,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0, - 0,0,0,0,1,-1] - y = [14.8,13.8,12.4,10.1,7.1,6.1,5.8,4.6,4.3,3.5,3.3,3.2,3.0, - 2.8,2.8,2.5,2.4,2.3,2.1,1.7,1.7,1.5,1.3,1.3,1.2,1.2,1.1, - 0.8,0.7,0.6,0.5,0.2,0.2,0.1,np.nan] - assert_almost_equal(mstats.pointbiserialr(x, y)[0], 0.36149, 5) - - -class TestTrimming(TestCase): - - def test_trim(self): - a = ma.arange(10) - assert_equal(mstats.trim(a), [0,1,2,3,4,5,6,7,8,9]) - a = ma.arange(10) - assert_equal(mstats.trim(a,(2,8)), [None,None,2,3,4,5,6,7,8,None]) - a = ma.arange(10) - assert_equal(mstats.trim(a,limits=(2,8),inclusive=(False,False)), - [None,None,None,3,4,5,6,7,None,None]) - a = ma.arange(10) - assert_equal(mstats.trim(a,limits=(0.1,0.2),relative=True), - [None,1,2,3,4,5,6,7,None,None]) - - a = ma.arange(12) - a[[0,-1]] = a[5] = masked - assert_equal(mstats.trim(a, (2,8)), - [None, None, 2, 3, 4, None, 6, 7, 8, None, None, None]) - - x = ma.arange(100).reshape(10, 10) - expected = [1]*10 + [0]*70 + [1]*20 - trimx = mstats.trim(x, (0.1,0.2), relative=True, axis=None) - assert_equal(trimx._mask.ravel(), expected) - trimx = mstats.trim(x, (0.1,0.2), relative=True, axis=0) - assert_equal(trimx._mask.ravel(), expected) - trimx = mstats.trim(x, (0.1,0.2), relative=True, axis=-1) - assert_equal(trimx._mask.T.ravel(), expected) - - # same as above, but with an extra masked row inserted - x = ma.arange(110).reshape(11, 10) - x[1] = masked - expected = [1]*20 + [0]*70 + [1]*20 - trimx = mstats.trim(x, (0.1,0.2), relative=True, axis=None) - assert_equal(trimx._mask.ravel(), expected) - trimx = mstats.trim(x, (0.1,0.2), relative=True, axis=0) - assert_equal(trimx._mask.ravel(), expected) - trimx = mstats.trim(x.T, (0.1,0.2), relative=True, axis=-1) - assert_equal(trimx.T._mask.ravel(), expected) - - def test_trim_old(self): - x = ma.arange(100) - assert_equal(mstats.trimboth(x).count(), 60) - assert_equal(mstats.trimtail(x,tail='r').count(), 80) - x[50:70] = masked - trimx = mstats.trimboth(x) - assert_equal(trimx.count(), 48) - assert_equal(trimx._mask, [1]*16 + [0]*34 + [1]*20 + [0]*14 + [1]*16) - x._mask = nomask - x.shape = (10,10) - assert_equal(mstats.trimboth(x).count(), 60) - assert_equal(mstats.trimtail(x).count(), 80) - - def test_trimmedmean(self): - data = ma.array([77, 87, 88,114,151,210,219,246,253,262, - 296,299,306,376,428,515,666,1310,2611]) - assert_almost_equal(mstats.trimmed_mean(data,0.1), 343, 0) - assert_almost_equal(mstats.trimmed_mean(data,(0.1,0.1)), 343, 0) - assert_almost_equal(mstats.trimmed_mean(data,(0.2,0.2)), 283, 0) - - def test_trimmed_stde(self): - data = ma.array([77, 87, 88,114,151,210,219,246,253,262, - 296,299,306,376,428,515,666,1310,2611]) - assert_almost_equal(mstats.trimmed_stde(data,(0.2,0.2)), 56.13193, 5) - assert_almost_equal(mstats.trimmed_stde(data,0.2), 56.13193, 5) - - def test_winsorization(self): - data = ma.array([77, 87, 88,114,151,210,219,246,253,262, - 296,299,306,376,428,515,666,1310,2611]) - assert_almost_equal(mstats.winsorize(data,(0.2,0.2)).var(ddof=1), - 21551.4, 1) - data[5] = masked - winsorized = mstats.winsorize(data) - 
assert_equal(winsorized.mask, data.mask) - - -class TestMoments(TestCase): - # Comparison numbers are found using R v.1.5.1 - # note that length(testcase) = 4 - # testmathworks comes from documentation for the - # Statistics Toolbox for Matlab and can be found at both - # http://www.mathworks.com/access/helpdesk/help/toolbox/stats/kurtosis.shtml - # http://www.mathworks.com/access/helpdesk/help/toolbox/stats/skewness.shtml - # Note that both test cases came from here. - testcase = [1,2,3,4] - testmathworks = ma.fix_invalid([1.165, 0.6268, 0.0751, 0.3516, -0.6965, - np.nan]) - testcase_2d = ma.array( - np.array([[0.05245846, 0.50344235, 0.86589117, 0.36936353, 0.46961149], - [0.11574073, 0.31299969, 0.45925772, 0.72618805, 0.75194407], - [0.67696689, 0.91878127, 0.09769044, 0.04645137, 0.37615733], - [0.05903624, 0.29908861, 0.34088298, 0.66216337, 0.83160998], - [0.64619526, 0.94894632, 0.27855892, 0.0706151, 0.39962917]]), - mask=np.array([[True, False, False, True, False], - [True, True, True, False, True], - [False, False, False, False, False], - [True, True, True, True, True], - [False, False, True, False, False]], dtype=np.bool)) - - def test_moment(self): - y = mstats.moment(self.testcase,1) - assert_almost_equal(y,0.0,10) - y = mstats.moment(self.testcase,2) - assert_almost_equal(y,1.25) - y = mstats.moment(self.testcase,3) - assert_almost_equal(y,0.0) - y = mstats.moment(self.testcase,4) - assert_almost_equal(y,2.5625) - - def test_variation(self): - y = mstats.variation(self.testcase) - assert_almost_equal(y,0.44721359549996, 10) - - def test_skewness(self): - y = mstats.skew(self.testmathworks) - assert_almost_equal(y,-0.29322304336607,10) - y = mstats.skew(self.testmathworks,bias=0) - assert_almost_equal(y,-0.437111105023940,10) - y = mstats.skew(self.testcase) - assert_almost_equal(y,0.0,10) - - def test_kurtosis(self): - # Set flags for axis = 0 and fisher=0 (Pearson's definition of kurtosis - # for compatibility with Matlab) - y = mstats.kurtosis(self.testmathworks,0,fisher=0,bias=1) - assert_almost_equal(y, 2.1658856802973,10) - # Note that MATLAB has confusing docs for the following case - # kurtosis(x,0) gives an unbiased estimate of Pearson's skewness - # kurtosis(x) gives a biased estimate of Fisher's skewness (Pearson-3) - # The MATLAB docs imply that both should give Fisher's - y = mstats.kurtosis(self.testmathworks,fisher=0, bias=0) - assert_almost_equal(y, 3.663542721189047,10) - y = mstats.kurtosis(self.testcase,0,0) - assert_almost_equal(y,1.64) - - # test that kurtosis works on multidimensional masked arrays - correct_2d = ma.array(np.array([-1.5, -3., -1.47247052385, 0., - -1.26979517952]), - mask=np.array([False, False, False, True, - False], dtype=np.bool)) - assert_array_almost_equal(mstats.kurtosis(self.testcase_2d, 1), - correct_2d) - for i, row in enumerate(self.testcase_2d): - assert_almost_equal(mstats.kurtosis(row), correct_2d[i]) - - correct_2d_bias_corrected = ma.array( - np.array([-1.5, -3., -1.88988209538, 0., -0.5234638463918877]), - mask=np.array([False, False, False, True, False], dtype=np.bool)) - assert_array_almost_equal(mstats.kurtosis(self.testcase_2d, 1, - bias=False), - correct_2d_bias_corrected) - for i, row in enumerate(self.testcase_2d): - assert_almost_equal(mstats.kurtosis(row, bias=False), - correct_2d_bias_corrected[i]) - - # Check consistency between stats and mstats implementations - assert_array_almost_equal_nulp(mstats.kurtosis(self.testcase_2d[2, :]), - stats.kurtosis(self.testcase_2d[2, :])) - - def test_mode(self): - a1 = 
[0,0,0,1,1,1,2,3,3,3,3,4,5,6,7] - a2 = np.reshape(a1, (3,5)) - a3 = np.array([1,2,3,4,5,6]) - a4 = np.reshape(a3, (3,2)) - ma1 = ma.masked_where(ma.array(a1) > 2, a1) - ma2 = ma.masked_where(a2 > 2, a2) - ma3 = ma.masked_where(a3 < 2, a3) - ma4 = ma.masked_where(ma.array(a4) < 2, a4) - assert_equal(mstats.mode(a1, axis=None), (3,4)) - assert_equal(mstats.mode(a1, axis=0), (3,4)) - assert_equal(mstats.mode(ma1, axis=None), (0,3)) - assert_equal(mstats.mode(a2, axis=None), (3,4)) - assert_equal(mstats.mode(ma2, axis=None), (0,3)) - assert_equal(mstats.mode(a3, axis=None), (1,1)) - assert_equal(mstats.mode(ma3, axis=None), (2,1)) - assert_equal(mstats.mode(a2, axis=0), ([[0,0,0,1,1]], [[1,1,1,1,1]])) - assert_equal(mstats.mode(ma2, axis=0), ([[0,0,0,1,1]], [[1,1,1,1,1]])) - assert_equal(mstats.mode(a2, axis=-1), ([[0],[3],[3]], [[3],[3],[1]])) - assert_equal(mstats.mode(ma2, axis=-1), ([[0],[1],[0]], [[3],[1],[0]])) - assert_equal(mstats.mode(ma4, axis=0), ([[3,2]], [[1,1]])) - assert_equal(mstats.mode(ma4, axis=-1), ([[2],[3],[5]], [[1],[1],[1]])) - - -class TestPercentile(TestCase): - def setUp(self): - self.a1 = [3,4,5,10,-3,-5,6] - self.a2 = [3,-6,-2,8,7,4,2,1] - self.a3 = [3.,4,5,10,-3,-5,-6,7.0] - - def test_percentile(self): - x = np.arange(8) * 0.5 - assert_equal(mstats.scoreatpercentile(x, 0), 0.) - assert_equal(mstats.scoreatpercentile(x, 100), 3.5) - assert_equal(mstats.scoreatpercentile(x, 50), 1.75) - - def test_2D(self): - x = ma.array([[1, 1, 1], - [1, 1, 1], - [4, 4, 3], - [1, 1, 1], - [1, 1, 1]]) - assert_equal(mstats.scoreatpercentile(x,50), [1,1,1]) - - -class TestVariability(TestCase): - """ Comparison numbers are found using R v.1.5.1 - note that length(testcase) = 4 - """ - testcase = ma.fix_invalid([1,2,3,4,np.nan]) - - def test_signaltonoise(self): - # This is not in R, so used: - # mean(testcase, axis=0) / (sqrt(var(testcase)*3/4)) - y = mstats.signaltonoise(self.testcase) - assert_almost_equal(y, 2.236067977) - - def test_sem(self): - # This is not in R, so used: sqrt(var(testcase)*3/4) / sqrt(3) - y = mstats.sem(self.testcase) - assert_almost_equal(y, 0.6454972244) - n = self.testcase.count() - assert_allclose(mstats.sem(self.testcase, ddof=0) * np.sqrt(n/(n-2)), - mstats.sem(self.testcase, ddof=2)) - - def test_zmap(self): - # This is not in R, so tested by using: - # (testcase[i]-mean(testcase,axis=0)) / sqrt(var(testcase)*3/4) - y = mstats.zmap(self.testcase, self.testcase) - desired_unmaskedvals = ([-1.3416407864999, -0.44721359549996, - 0.44721359549996, 1.3416407864999]) - assert_array_almost_equal(desired_unmaskedvals, - y.data[y.mask == False], decimal=12) - - def test_zscore(self): - # This is not in R, so tested by using: - # (testcase[i]-mean(testcase,axis=0)) / sqrt(var(testcase)*3/4) - y = mstats.zscore(self.testcase) - desired = ma.fix_invalid([-1.3416407864999, -0.44721359549996, - 0.44721359549996, 1.3416407864999, np.nan]) - assert_almost_equal(desired, y, decimal=12) - - -class TestMisc(TestCase): - - def test_obrientransform(self): - args = [[5]*5+[6]*11+[7]*9+[8]*3+[9]*2+[10]*2, - [6]+[7]*2+[8]*4+[9]*9+[10]*16] - result = [5*[3.1828]+11*[0.5591]+9*[0.0344]+3*[1.6086]+2*[5.2817]+2*[11.0538], - [10.4352]+2*[4.8599]+4*[1.3836]+9*[0.0061]+16*[0.7277]] - assert_almost_equal(np.round(mstats.obrientransform(*args).T,4), - result,4) - - def test_kstwosamp(self): - x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1], - [4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3], - [3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan], - [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]] - x = 
ma.fix_invalid(x).T - (winter,spring,summer,fall) = x.T - - assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring),4), - (0.1818,0.9892)) - assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring,'g'),4), - (0.1469,0.7734)) - assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring,'l'),4), - (0.1818,0.6744)) - - def test_friedmanchisq(self): - # No missing values - args = ([9.0,9.5,5.0,7.5,9.5,7.5,8.0,7.0,8.5,6.0], - [7.0,6.5,7.0,7.5,5.0,8.0,6.0,6.5,7.0,7.0], - [6.0,8.0,4.0,6.0,7.0,6.5,6.0,4.0,6.5,3.0]) - result = mstats.friedmanchisquare(*args) - assert_almost_equal(result[0], 10.4737, 4) - assert_almost_equal(result[1], 0.005317, 6) - # Missing values - x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1], - [4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3], - [3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan], - [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]] - x = ma.fix_invalid(x) - result = mstats.friedmanchisquare(*x) - assert_almost_equal(result[0], 2.0156, 4) - assert_almost_equal(result[1], 0.5692, 4) - - -def test_regress_simple(): - # Regress a line with sinusoidal noise. Test for #1273. - x = np.linspace(0, 100, 100) - y = 0.2 * np.linspace(0, 100, 100) + 10 - y += np.sin(np.linspace(0, 20, 100)) - - slope, intercept, r_value, p_value, sterr = mstats.linregress(x, y) - assert_almost_equal(slope, 0.19644990055858422) - assert_almost_equal(intercept, 10.211269918932341) - - -def test_theilslopes(): - # Test for basic slope and intercept. - slope, intercept, lower, upper = mstats.theilslopes([0,1,1]) - assert_almost_equal(slope, 0.5) - assert_almost_equal(intercept, 0.5) - - # Test for correct masking. - y = np.ma.array([0,1,100,1], mask=[False, False, True, False]) - slope, intercept, lower, upper = mstats.theilslopes(y) - assert_almost_equal(slope, 1./3) - assert_almost_equal(intercept, 2./3) - - # Test of confidence intervals from example in Sen (1968). 
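# Aside (sketch) before the Sen (1968) data below: theilslopes returns
# the median of all pairwise slopes, with the intercept defaulting to
# median(y) - slope * median(x).  For y = [0, 1, 1] at x = [0, 1, 2]
# the pairwise slopes are [1, 0.5, 0], hence slope = intercept = 0.5,
# matching the first pair of asserts above.
import numpy as np
from itertools import combinations
y = np.array([0.0, 1.0, 1.0])
x = np.arange(3.0)
slopes = [(y[j] - y[i]) / (x[j] - x[i])
          for i, j in combinations(range(len(x)), 2)]
assert np.median(slopes) == 0.5
assert np.median(y) - 0.5 * np.median(x) == 0.5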
- x = [1, 2, 3, 4, 10, 12, 18] - y = [9, 15, 19, 20, 45, 55, 78] - slope, intercept, lower, upper = mstats.theilslopes(y, x, 0.07) - assert_almost_equal(slope, 4) - assert_almost_equal(upper, 4.38, decimal=2) - assert_almost_equal(lower, 3.71, decimal=2) - - -def test_plotting_positions(): - # Regression test for #1256 - pos = mstats.plotting_positions(np.arange(3), 0, 0) - assert_array_almost_equal(pos.data, np.array([0.25, 0.5, 0.75])) - - -class TestNormalitytests(): - - def test_vs_nonmasked(self): - x = np.array((-2,-1,0,1,2,3)*4)**2 - assert_array_almost_equal(mstats.normaltest(x), - stats.normaltest(x)) - assert_array_almost_equal(mstats.skewtest(x), - stats.skewtest(x)) - assert_array_almost_equal(mstats.kurtosistest(x), - stats.kurtosistest(x)) - - funcs = [stats.normaltest, stats.skewtest, stats.kurtosistest] - mfuncs = [mstats.normaltest, mstats.skewtest, mstats.kurtosistest] - x = [1, 2, 3, 4] - for func, mfunc in zip(funcs, mfuncs): - assert_raises(ValueError, func, x) - assert_raises(ValueError, mfunc, x) - - def test_axis_None(self): - # Test axis=None (equal to axis=0 for 1-D input) - x = np.array((-2,-1,0,1,2,3)*4)**2 - assert_allclose(mstats.normaltest(x, axis=None), mstats.normaltest(x)) - assert_allclose(mstats.skewtest(x, axis=None), mstats.skewtest(x)) - assert_allclose(mstats.kurtosistest(x, axis=None), - mstats.kurtosistest(x)) - - def test_maskedarray_input(self): - # Add some masked values, test result doesn't change - x = np.array((-2,-1,0,1,2,3)*4)**2 - xm = np.ma.array(np.r_[np.inf, x, 10], - mask=np.r_[True, [False] * x.size, True]) - assert_allclose(mstats.normaltest(xm), stats.normaltest(x)) - assert_allclose(mstats.skewtest(xm), stats.skewtest(x)) - assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x)) - - def test_nd_input(self): - x = np.array((-2,-1,0,1,2,3)*4)**2 - x_2d = np.vstack([x] * 2).T - for func in [mstats.normaltest, mstats.skewtest, mstats.kurtosistest]: - res_1d = func(x) - res_2d = func(x_2d) - assert_allclose(res_2d[0], [res_1d[0]] * 2) - assert_allclose(res_2d[1], [res_1d[1]] * 2) - - -#TODO: for all ttest functions, add tests with masked array inputs -class TestTtest_rel(): - def test_vs_nonmasked(self): - np.random.seed(1234567) - outcome = np.random.randn(20, 4) + [0, 0, 1, 2] - - # 1-D inputs - res1 = stats.ttest_rel(outcome[:, 0], outcome[:, 1]) - res2 = mstats.ttest_rel(outcome[:, 0], outcome[:, 1]) - assert_allclose(res1, res2) - - # 2-D inputs - res1 = stats.ttest_rel(outcome[:, 0], outcome[:, 1], axis=None) - res2 = mstats.ttest_rel(outcome[:, 0], outcome[:, 1], axis=None) - assert_allclose(res1, res2) - res1 = stats.ttest_rel(outcome[:, :2], outcome[:, 2:], axis=0) - res2 = mstats.ttest_rel(outcome[:, :2], outcome[:, 2:], axis=0) - assert_allclose(res1, res2) - - # Check default is axis=0 - res3 = mstats.ttest_rel(outcome[:, :2], outcome[:, 2:]) - assert_allclose(res2, res3) - - def test_invalid_input_size(self): - assert_raises(ValueError, mstats.ttest_rel, - np.arange(10), np.arange(11)) - x = np.arange(24) - assert_raises(ValueError, mstats.ttest_rel, - x.reshape(2, 3, 4), x.reshape(2, 4, 3), axis=1) - assert_raises(ValueError, mstats.ttest_rel, - x.reshape(2, 3, 4), x.reshape(2, 4, 3), axis=2) - - def test_empty(self): - res1 = mstats.ttest_rel([], []) - assert_(np.all(np.isnan(res1))) - - -class TestTtest_ind(): - def test_vs_nonmasked(self): - np.random.seed(1234567) - outcome = np.random.randn(20, 4) + [0, 0, 1, 2] - - # 1-D inputs - res1 = stats.ttest_ind(outcome[:, 0], outcome[:, 1]) - res2 = 
mstats.ttest_ind(outcome[:, 0], outcome[:, 1])
-        assert_allclose(res1, res2)
-
-        # 2-D inputs
-        res1 = stats.ttest_ind(outcome[:, 0], outcome[:, 1], axis=None)
-        res2 = mstats.ttest_ind(outcome[:, 0], outcome[:, 1], axis=None)
-        assert_allclose(res1, res2)
-        res1 = stats.ttest_ind(outcome[:, :2], outcome[:, 2:], axis=0)
-        res2 = mstats.ttest_ind(outcome[:, :2], outcome[:, 2:], axis=0)
-        assert_allclose(res1, res2)
-
-        # Check default is axis=0
-        res3 = mstats.ttest_ind(outcome[:, :2], outcome[:, 2:])
-        assert_allclose(res2, res3)
-
-    def test_empty(self):
-        res1 = mstats.ttest_ind([], [])
-        assert_(np.all(np.isnan(res1)))
-
-
-class TestTtest_1samp():
-    def test_vs_nonmasked(self):
-        np.random.seed(1234567)
-        outcome = np.random.randn(20, 4) + [0, 0, 1, 2]
-
-        # 1-D inputs
-        res1 = stats.ttest_1samp(outcome[:, 0], 1)
-        res2 = mstats.ttest_1samp(outcome[:, 0], 1)
-        assert_allclose(res1, res2)
-
-        # 2-D inputs
-        res1 = stats.ttest_1samp(outcome[:, 0], outcome[:, 1], axis=None)
-        res2 = mstats.ttest_1samp(outcome[:, 0], outcome[:, 1], axis=None)
-        assert_allclose(res1, res2)
-        res1 = stats.ttest_1samp(outcome[:, :2], outcome[:, 2:], axis=0)
-        res2 = mstats.ttest_1samp(outcome[:, :2], outcome[:, 2:], axis=0)
-        assert_allclose(res1, res2)
-
-        # Check default is axis=0
-        res3 = mstats.ttest_1samp(outcome[:, :2], outcome[:, 2:])
-        assert_allclose(res2, res3)
-
-    def test_empty(self):
-        res1 = mstats.ttest_1samp([], 1)
-        assert_(np.all(np.isnan(res1)))
-
-
-class TestCompareWithStats(TestCase):
-    """
-    Class to compare mstats results with stats results.
-
-    It is generally assumed that scipy.stats is at a more mature stage than
-    scipy.stats.mstats, so when an mstats routine produces results that agree
-    with its scipy.stats counterpart, that agreement is also taken as a
-    validation of the mstats routine.
-
-    Different sample sizes are used for testing, as some discrepancies between
-    stats and mstats depend on sample size.
-
-    Author: Alexander Loew
-
-    NOTE that some tests fail. This might be caused by
-    a) actual differences or bugs between stats and mstats
-    b) numerical inaccuracies
-    c) different definitions of routine interfaces
-
-    These failures need to be checked. The current workaround is to disable
-    the affected tests and report the failures on scipy-dev.
-
-    """
-    def get_n(self):
-        """ Returns list of sample sizes to be used for comparison.
""" - return [1000, 100, 10, 5] - - def generate_xy_sample(self, n): - # This routine generates numpy arrays and corresponding masked arrays - # with the same data, but additional masked values - np.random.seed(1234567) - x = np.random.randn(n) - y = x + np.random.randn(n) - xm = np.ones(len(x) + 5) * 1e16 - ym = np.ones(len(y) + 5) * 1e16 - xm[0:len(x)] = x - ym[0:len(y)] = y - mask = xm > 9e15 - xm = np.ma.array(xm, mask=mask) - ym = np.ma.array(ym, mask=mask) - return x, y, xm, ym - - def generate_xy_sample2D(self, n, nx): - x = np.ones((n, nx)) * np.nan - y = np.ones((n, nx)) * np.nan - xm = np.ones((n+5, nx)) * np.nan - ym = np.ones((n+5, nx)) * np.nan - - for i in range(nx): - x[:,i], y[:,i], dx, dy = self.generate_xy_sample(n) - - xm[0:n, :] = x[0:n] - ym[0:n, :] = y[0:n] - xm = np.ma.array(xm, mask=np.isnan(xm)) - ym = np.ma.array(ym, mask=np.isnan(ym)) - return x, y, xm, ym - - def test_linregress(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - res1 = stats.linregress(x, y) - res2 = stats.mstats.linregress(xm, ym) - assert_allclose(np.asarray(res1), np.asarray(res2)) - - def test_pearsonr(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - r, p = stats.pearsonr(x, y) - rm, pm = stats.mstats.pearsonr(xm, ym) - - assert_almost_equal(r, rm, decimal=14) - assert_almost_equal(p, pm, decimal=14) - - def test_spearmanr(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - r, p = stats.spearmanr(x, y) - rm, pm = stats.mstats.spearmanr(xm, ym) - assert_almost_equal(r, rm, 14) - assert_almost_equal(p, pm, 14) - - def test_gmean(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - r = stats.gmean(abs(x)) - rm = stats.mstats.gmean(abs(xm)) - assert_allclose(r, rm, rtol=1e-13) - - r = stats.gmean(abs(y)) - rm = stats.mstats.gmean(abs(ym)) - assert_allclose(r, rm, rtol=1e-13) - - def test_hmean(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - - r = stats.hmean(abs(x)) - rm = stats.mstats.hmean(abs(xm)) - assert_almost_equal(r, rm, 10) - - r = stats.hmean(abs(y)) - rm = stats.mstats.hmean(abs(ym)) - assert_almost_equal(r, rm, 10) - - def test_skew(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - - r = stats.skew(x) - rm = stats.mstats.skew(xm) - assert_almost_equal(r, rm, 10) - - r = stats.skew(y) - rm = stats.mstats.skew(ym) - assert_almost_equal(r, rm, 10) - - def test_moment(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - - r = stats.moment(x) - rm = stats.mstats.moment(xm) - assert_almost_equal(r, rm, 10) - - r = stats.moment(y) - rm = stats.mstats.moment(ym) - assert_almost_equal(r, rm, 10) - - def test_signaltonoise(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - - r = stats.signaltonoise(x) - rm = stats.mstats.signaltonoise(xm) - assert_almost_equal(r, rm, 10) - - r = stats.signaltonoise(y) - rm = stats.mstats.signaltonoise(ym) - assert_almost_equal(r, rm, 10) - - def test_betai(self): - np.random.seed(12345) - for i in range(10): - a = np.random.rand() * 5. - b = np.random.rand() * 200. - assert_equal(stats.betai(a, b, 0.), 0.) - assert_equal(stats.betai(a, b, 1.), 1.) - assert_equal(stats.mstats.betai(a, b, 0.), 0.) - assert_equal(stats.mstats.betai(a, b, 1.), 1.) 
- x = np.random.rand() - assert_almost_equal(stats.betai(a, b, x), - stats.mstats.betai(a, b, x), decimal=13) - - def test_zscore(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - - #reference solution - zx = (x - x.mean()) / x.std() - zy = (y - y.mean()) / y.std() - - #validate stats - assert_allclose(stats.zscore(x), zx, rtol=1e-10) - assert_allclose(stats.zscore(y), zy, rtol=1e-10) - - #compare stats and mstats - assert_allclose(stats.zscore(x), stats.mstats.zscore(xm[0:len(x)]), - rtol=1e-10) - assert_allclose(stats.zscore(y), stats.mstats.zscore(ym[0:len(y)]), - rtol=1e-10) - - def test_kurtosis(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - r = stats.kurtosis(x) - rm = stats.mstats.kurtosis(xm) - assert_almost_equal(r, rm, 10) - - r = stats.kurtosis(y) - rm = stats.mstats.kurtosis(ym) - assert_almost_equal(r, rm, 10) - - def test_sem(self): - # example from stats.sem doc - a = np.arange(20).reshape(5,4) - am = np.ma.array(a) - r = stats.sem(a,ddof=1) - rm = stats.mstats.sem(am, ddof=1) - - assert_allclose(r, 2.82842712, atol=1e-5) - assert_allclose(rm, 2.82842712, atol=1e-5) - - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - assert_almost_equal(stats.mstats.sem(xm, axis=None, ddof=0), - stats.sem(x, axis=None, ddof=0), decimal=13) - assert_almost_equal(stats.mstats.sem(ym, axis=None, ddof=0), - stats.sem(y, axis=None, ddof=0), decimal=13) - assert_almost_equal(stats.mstats.sem(xm, axis=None, ddof=1), - stats.sem(x, axis=None, ddof=1), decimal=13) - assert_almost_equal(stats.mstats.sem(ym, axis=None, ddof=1), - stats.sem(y, axis=None, ddof=1), decimal=13) - - def test_describe(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - r = stats.describe(x, ddof=1) - rm = stats.mstats.describe(xm, ddof=1) - for ii in range(6): - assert_almost_equal(np.asarray(r[ii]), - np.asarray(rm[ii]), - decimal=12) - - def test_rankdata(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - r = stats.rankdata(x) - rm = stats.mstats.rankdata(x) - assert_allclose(r, rm) - - def test_tmean(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - assert_almost_equal(stats.tmean(x),stats.mstats.tmean(xm), 14) - assert_almost_equal(stats.tmean(y),stats.mstats.tmean(ym), 14) - - def test_tmax(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - assert_almost_equal(stats.tmax(x,2.), - stats.mstats.tmax(xm,2.), 10) - assert_almost_equal(stats.tmax(y,2.), - stats.mstats.tmax(ym,2.), 10) - - def test_tmin(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - assert_equal(stats.tmin(x),stats.mstats.tmin(xm)) - assert_equal(stats.tmin(y),stats.mstats.tmin(ym)) - - assert_almost_equal(stats.tmin(x,lowerlimit=-1.), - stats.mstats.tmin(xm,lowerlimit=-1.), 10) - assert_almost_equal(stats.tmin(y,lowerlimit=-1.), - stats.mstats.tmin(ym,lowerlimit=-1.), 10) - - def test_zmap(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - z = stats.zmap(x,y) - zm = stats.mstats.zmap(xm,ym) - assert_allclose(z, zm[0:len(z)], atol=1e-10) - - def test_variation(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - assert_almost_equal(stats.variation(x), stats.mstats.variation(xm), - decimal=12) - assert_almost_equal(stats.variation(y), stats.mstats.variation(ym), - decimal=12) - - def test_tvar(self): - for n in self.get_n(): - x, y, xm, ym = self.generate_xy_sample(n) - 
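# Aside (sketch): the 2.82842712 reference in test_sem above is just
# 2*sqrt(2): every column of np.arange(20).reshape(5, 4) is an
# arithmetic sequence with step 4, so var(ddof=1) = 40 per column and
# sem = sqrt(40 / 5) = 2 * sqrt(2).
import numpy as np
col = np.arange(20).reshape(5, 4)[:, 0]          # [0, 4, 8, 12, 16]
assert np.isclose(col.std(ddof=1) / np.sqrt(5), 2 * np.sqrt(2))
# (test_tvar's assertions continue below.)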
assert_almost_equal(stats.tvar(x), stats.mstats.tvar(xm),
-                                decimal=12)
-            assert_almost_equal(stats.tvar(y), stats.mstats.tvar(ym),
-                                decimal=12)
-
-    def test_trimboth(self):
-        a = np.arange(20)
-        b = stats.trimboth(a, 0.1)
-        bm = stats.mstats.trimboth(a, 0.1)
-        assert_allclose(b, bm.data[~bm.mask])
-
-    def test_tsem(self):
-        for n in self.get_n():
-            x, y, xm, ym = self.generate_xy_sample(n)
-            assert_almost_equal(stats.tsem(x),stats.mstats.tsem(xm), decimal=14)
-            assert_almost_equal(stats.tsem(y),stats.mstats.tsem(ym), decimal=14)
-            assert_almost_equal(stats.tsem(x,limits=(-2.,2.)),
-                                stats.mstats.tsem(xm,limits=(-2.,2.)),
-                                decimal=14)
-
-    def test_skewtest(self):
-        # this test is for 1D data
-        for n in self.get_n():
-            if n > 8:
-                x, y, xm, ym = self.generate_xy_sample(n)
-                r = stats.skewtest(x)
-                rm = stats.mstats.skewtest(xm)
-                assert_equal(r[0], rm[0])
-                # TODO: the p-value comparison is skipped; it is a known
-                # issue that mstats returns a slightly different p-value
-                # here. What is a bit strange is that other tests, such as
-                # test_maskedarray_input, do not fail.
-                #~ assert_almost_equal(r[1], rm[1])
-
-    def test_skewtest_2D_notmasked(self):
-        # a normal ndarray is passed to the masked function
-        x = np.random.random((20, 2)) * 20.
-        r = stats.skewtest(x)
-        rm = stats.mstats.skewtest(x)
-        assert_allclose(np.asarray(r), np.asarray(rm))
-
-    def test_skewtest_2D_WithMask(self):
-        nx = 2
-        for n in self.get_n():
-            if n > 8:
-                x, y, xm, ym = self.generate_xy_sample2D(n, nx)
-                r = stats.skewtest(x)
-                rm = stats.mstats.skewtest(xm)
-
-                assert_equal(r[0][0],rm[0][0])
-                assert_equal(r[0][1],rm[0][1])
-
-    def test_normaltest(self):
-        np.seterr(over='raise')
-        for n in self.get_n():
-            if n > 8:
-                with warnings.catch_warnings():
-                    warnings.filterwarnings('ignore', category=UserWarning)
-                    x, y, xm, ym = self.generate_xy_sample(n)
-                    r = stats.normaltest(x)
-                    rm = stats.mstats.normaltest(xm)
-                    assert_allclose(np.asarray(r), np.asarray(rm))
-
-    def test_find_repeats(self):
-        x = np.asarray([1,1,2,2,3,3,3,4,4,4,4]).astype('float')
-        tmp = np.asarray([1,1,2,2,3,3,3,4,4,4,4,5,5,5,5]).astype('float')
-        mask = (tmp == 5.)
-        xm = np.ma.array(tmp, mask=mask)
-
-        r = stats.find_repeats(x)
-        rm = stats.mstats.find_repeats(xm)
-
-        assert_equal(r,rm)
-
-    def test_kendalltau(self):
-        for n in self.get_n():
-            x, y, xm, ym = self.generate_xy_sample(n)
-            r = stats.kendalltau(x, y)
-            rm = stats.mstats.kendalltau(xm, ym)
-            assert_almost_equal(r[0], rm[0], decimal=10)
-            assert_almost_equal(r[1], rm[1], decimal=7)
-
-    def test_obrientransform(self):
-        for n in self.get_n():
-            x, y, xm, ym = self.generate_xy_sample(n)
-            r = stats.obrientransform(x)
-            rm = stats.mstats.obrientransform(xm)
-            assert_almost_equal(r.T, rm[0:len(x)])
-
-
-if __name__ == "__main__":
-    run_module_suite()
diff --git a/wafo/stats/tests/test_mstats_extras.py b/wafo/stats/tests/test_mstats_extras.py
deleted file mode 100644
index 2ab5762..0000000
--- a/wafo/stats/tests/test_mstats_extras.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# pylint: disable-msg=W0611, W0612, W0511,R0201
-"""Test suite for masked array statistics.
- -:author: Pierre Gerard-Marchant -:contact: pierregm_at_uga_dot_edu -""" -from __future__ import division, print_function, absolute_import - -__author__ = "Pierre GF Gerard-Marchant ($Author: backtopop $)" - -import numpy as np - -import numpy.ma as ma - -import wafo.stats.mstats as ms -#import wafo.stats.mmorestats as mms - -from numpy.testing import TestCase, run_module_suite, assert_equal, \ - assert_almost_equal, assert_ - - -class TestMisc(TestCase): - - def __init__(self, *args, **kwargs): - TestCase.__init__(self, *args, **kwargs) - - def test_mjci(self): - "Tests the Marits-Jarrett estimator" - data = ma.array([77, 87, 88,114,151,210,219,246,253,262, - 296,299,306,376,428,515,666,1310,2611]) - assert_almost_equal(ms.mjci(data),[55.76819,45.84028,198.87875],5) - - def test_trimmedmeanci(self): - "Tests the confidence intervals of the trimmed mean." - data = ma.array([545,555,558,572,575,576,578,580, - 594,605,635,651,653,661,666]) - assert_almost_equal(ms.trimmed_mean(data,0.2), 596.2, 1) - assert_equal(np.round(ms.trimmed_mean_ci(data,(0.2,0.2)),1), - [561.8, 630.6]) - - def test_idealfourths(self): - "Tests ideal-fourths" - test = np.arange(100) - assert_almost_equal(np.asarray(ms.idealfourths(test)), - [24.416667,74.583333],6) - test_2D = test.repeat(3).reshape(-1,3) - assert_almost_equal(ms.idealfourths(test_2D, axis=0), - [[24.416667,24.416667,24.416667], - [74.583333,74.583333,74.583333]],6) - assert_almost_equal(ms.idealfourths(test_2D, axis=1), - test.repeat(2).reshape(-1,2)) - test = [0,0] - _result = ms.idealfourths(test) - assert_(np.isnan(_result).all()) - -#.............................................................................. - - -class TestQuantiles(TestCase): - - def __init__(self, *args, **kwargs): - TestCase.__init__(self, *args, **kwargs) - - def test_hdquantiles(self): - data = [0.706560797,0.727229578,0.990399276,0.927065621,0.158953014, - 0.887764025,0.239407086,0.349638551,0.972791145,0.149789972, - 0.936947700,0.132359948,0.046041972,0.641675031,0.945530547, - 0.224218684,0.771450991,0.820257774,0.336458052,0.589113496, - 0.509736129,0.696838829,0.491323573,0.622767425,0.775189248, - 0.641461450,0.118455200,0.773029450,0.319280007,0.752229111, - 0.047841438,0.466295911,0.583850781,0.840581845,0.550086491, - 0.466470062,0.504765074,0.226855960,0.362641207,0.891620942, - 0.127898691,0.490094097,0.044882048,0.041441695,0.317976349, - 0.504135618,0.567353033,0.434617473,0.636243375,0.231803616, - 0.230154113,0.160011327,0.819464108,0.854706985,0.438809221, - 0.487427267,0.786907310,0.408367937,0.405534192,0.250444460, - 0.995309248,0.144389588,0.739947527,0.953543606,0.680051621, - 0.388382017,0.863530727,0.006514031,0.118007779,0.924024803, - 0.384236354,0.893687694,0.626534881,0.473051932,0.750134705, - 0.241843555,0.432947602,0.689538104,0.136934797,0.150206859, - 0.474335206,0.907775349,0.525869295,0.189184225,0.854284286, - 0.831089744,0.251637345,0.587038213,0.254475554,0.237781276, - 0.827928620,0.480283781,0.594514455,0.213641488,0.024194386, - 0.536668589,0.699497811,0.892804071,0.093835427,0.731107772] - # - assert_almost_equal(ms.hdquantiles(data,[0., 1.]), - [0.006514031, 0.995309248]) - hdq = ms.hdquantiles(data,[0.25, 0.5, 0.75]) - assert_almost_equal(hdq, [0.253210762, 0.512847491, 0.762232442,]) - hdq = ms.hdquantiles_sd(data,[0.25, 0.5, 0.75]) - assert_almost_equal(hdq, [0.03786954, 0.03805389, 0.03800152,], 4) - # - data = np.array(data).reshape(10,10) - hdq = ms.hdquantiles(data,[0.25,0.5,0.75],axis=0) - assert_almost_equal(hdq[:,0], 
ms.hdquantiles(data[:,0],[0.25,0.5,0.75])) - assert_almost_equal(hdq[:,-1], ms.hdquantiles(data[:,-1],[0.25,0.5,0.75])) - hdq = ms.hdquantiles(data,[0.25,0.5,0.75],axis=0,var=True) - assert_almost_equal(hdq[...,0], - ms.hdquantiles(data[:,0],[0.25,0.5,0.75],var=True)) - assert_almost_equal(hdq[...,-1], - ms.hdquantiles(data[:,-1],[0.25,0.5,0.75], var=True)) - - -############################################################################### - -if __name__ == "__main__": - run_module_suite() diff --git a/wafo/stats/tests/test_multivariate.py b/wafo/stats/tests/test_multivariate.py deleted file mode 100644 index 63d2a8c..0000000 --- a/wafo/stats/tests/test_multivariate.py +++ /dev/null @@ -1,485 +0,0 @@ -""" -Test functions for multivariate normal distributions. - -""" -from __future__ import division, print_function, absolute_import - -from numpy.testing import ( - assert_allclose, - assert_almost_equal, - assert_array_almost_equal, - assert_equal, - assert_raises, - run_module_suite, -) - -import numpy -import numpy as np - -import scipy.linalg -from wafo.stats._multivariate import _PSD, _lnB -from wafo.stats import multivariate_normal -from wafo.stats import dirichlet, beta -from wafo.stats import norm - -from scipy.integrate import romb - - -def test_input_shape(): - mu = np.arange(3) - cov = np.identity(2) - assert_raises(ValueError, multivariate_normal.pdf, (0, 1), mu, cov) - assert_raises(ValueError, multivariate_normal.pdf, (0, 1, 2), mu, cov) - - -def test_scalar_values(): - np.random.seed(1234) - - # When evaluated on scalar data, the pdf should return a scalar - x, mean, cov = 1.5, 1.7, 2.5 - pdf = multivariate_normal.pdf(x, mean, cov) - assert_equal(pdf.ndim, 0) - - # When evaluated on a single vector, the pdf should return a scalar - x = np.random.randn(5) - mean = np.random.randn(5) - cov = np.abs(np.random.randn(5)) # Diagonal values for cov. matrix - pdf = multivariate_normal.pdf(x, mean, cov) - assert_equal(pdf.ndim, 0) - - -def test_logpdf(): - # Check that the log of the pdf is in fact the logpdf - np.random.seed(1234) - x = np.random.randn(5) - mean = np.random.randn(5) - cov = np.abs(np.random.randn(5)) - d1 = multivariate_normal.logpdf(x, mean, cov) - d2 = multivariate_normal.pdf(x, mean, cov) - assert_allclose(d1, np.log(d2)) - - -def test_rank(): - # Check that the rank is detected correctly. - np.random.seed(1234) - n = 4 - mean = np.random.randn(n) - for expected_rank in range(1, n + 1): - s = np.random.randn(n, expected_rank) - cov = np.dot(s, s.T) - distn = multivariate_normal(mean, cov, allow_singular=True) - assert_equal(distn.cov_info.rank, expected_rank) - - -def _sample_orthonormal_matrix(n): - M = np.random.randn(n, n) - u, s, v = scipy.linalg.svd(M) - return u - - -def test_degenerate_distributions(): - for n in range(1, 5): - x = np.random.randn(n) - for k in range(1, n + 1): - # Sample a small covariance matrix. - s = np.random.randn(k, k) - cov_kk = np.dot(s, s.T) - - # Embed the small covariance matrix into a larger low rank matrix. - cov_nn = np.zeros((n, n)) - cov_nn[:k, :k] = cov_kk - - # Define a rotation of the larger low rank matrix. - u = _sample_orthonormal_matrix(n) - cov_rr = np.dot(u, np.dot(cov_nn, u.T)) - y = np.dot(u, x) - - # Check some identities. 
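# Aside (sketch): for a diagonal covariance the multivariate normal
# factorises into independent 1-D normals, so its logpdf is a sum of
# norm.logpdf terms -- a quick way to sanity-check the logpdf/pdf
# identity exercised in test_logpdf above.
import numpy as np
from scipy.stats import multivariate_normal, norm
x = np.array([0.3, -1.2, 0.5])
var = np.array([1.0, 2.0, 0.5])
lp = multivariate_normal.logpdf(x, np.zeros(3), np.diag(var))
assert np.isclose(lp, norm.logpdf(x, 0.0, np.sqrt(var)).sum())
# (The degenerate-distribution identity checks continue below.)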
- distn_kk = multivariate_normal(np.zeros(k), cov_kk, - allow_singular=True) - distn_nn = multivariate_normal(np.zeros(n), cov_nn, - allow_singular=True) - distn_rr = multivariate_normal(np.zeros(n), cov_rr, - allow_singular=True) - assert_equal(distn_kk.cov_info.rank, k) - assert_equal(distn_nn.cov_info.rank, k) - assert_equal(distn_rr.cov_info.rank, k) - pdf_kk = distn_kk.pdf(x[:k]) - pdf_nn = distn_nn.pdf(x) - pdf_rr = distn_rr.pdf(y) - assert_allclose(pdf_kk, pdf_nn) - assert_allclose(pdf_kk, pdf_rr) - logpdf_kk = distn_kk.logpdf(x[:k]) - logpdf_nn = distn_nn.logpdf(x) - logpdf_rr = distn_rr.logpdf(y) - assert_allclose(logpdf_kk, logpdf_nn) - assert_allclose(logpdf_kk, logpdf_rr) - - -def test_large_pseudo_determinant(): - # Check that large pseudo-determinants are handled appropriately. - - # Construct a singular diagonal covariance matrix - # whose pseudo determinant overflows double precision. - large_total_log = 1000.0 - npos = 100 - nzero = 2 - large_entry = np.exp(large_total_log / npos) - n = npos + nzero - cov = np.zeros((n, n), dtype=float) - np.fill_diagonal(cov, large_entry) - cov[-nzero:, -nzero:] = 0 - - # Check some determinants. - assert_equal(scipy.linalg.det(cov), 0) - assert_equal(scipy.linalg.det(cov[:npos, :npos]), np.inf) - - # np.linalg.slogdet is only available in numpy 1.6+ - # but scipy currently supports numpy 1.5.1. - # assert_allclose(np.linalg.slogdet(cov[:npos, :npos]), - # (1, large_total_log)) - - # Check the pseudo-determinant. - psd = _PSD(cov) - assert_allclose(psd.log_pdet, large_total_log) - - -def test_broadcasting(): - np.random.seed(1234) - n = 4 - - # Construct a random covariance matrix. - data = np.random.randn(n, n) - cov = np.dot(data, data.T) - mean = np.random.randn(n) - - # Construct an ndarray which can be interpreted as - # a 2x3 array whose elements are random data vectors. - X = np.random.randn(2, 3, n) - - # Check that multiple data points can be evaluated at once. 
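# Aside (sketch): the log pseudo-determinant checked in
# test_large_pseudo_determinant above is the sum of the logs of the
# non-zero eigenvalues only, so it stays finite where the ordinary
# determinant is 0 or overflows:
import numpy as np
cov = np.diag([np.e, np.e, 0.0])                 # singular, det == 0
eigs = np.linalg.eigvalsh(cov)
assert np.isclose(np.sum(np.log(eigs[eigs > 1e-12])), 2.0)
# (The broadcasting loop continues below.)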
- for i in range(2): - for j in range(3): - actual = multivariate_normal.pdf(X[i, j], mean, cov) - desired = multivariate_normal.pdf(X, mean, cov)[i, j] - assert_allclose(actual, desired) - - -def test_normal_1D(): - # The probability density function for a 1D normal variable should - # agree with the standard normal distribution in scipy.stats.distributions - x = np.linspace(0, 2, 10) - mean, cov = 1.2, 0.9 - scale = cov**0.5 - d1 = norm.pdf(x, mean, scale) - d2 = multivariate_normal.pdf(x, mean, cov) - assert_allclose(d1, d2) - - -def test_marginalization(): - # Integrating out one of the variables of a 2D Gaussian should - # yield a 1D Gaussian - mean = np.array([2.5, 3.5]) - cov = np.array([[.5, 0.2], [0.2, .6]]) - n = 2 ** 8 + 1 # Number of samples - delta = 6 / (n - 1) # Grid spacing - - v = np.linspace(0, 6, n) - xv, yv = np.meshgrid(v, v) - pos = np.empty((n, n, 2)) - pos[:, :, 0] = xv - pos[:, :, 1] = yv - pdf = multivariate_normal.pdf(pos, mean, cov) - - # Marginalize over x and y axis - margin_x = romb(pdf, delta, axis=0) - margin_y = romb(pdf, delta, axis=1) - - # Compare with standard normal distribution - gauss_x = norm.pdf(v, loc=mean[0], scale=cov[0, 0] ** 0.5) - gauss_y = norm.pdf(v, loc=mean[1], scale=cov[1, 1] ** 0.5) - assert_allclose(margin_x, gauss_x, rtol=1e-2, atol=1e-2) - assert_allclose(margin_y, gauss_y, rtol=1e-2, atol=1e-2) - - -def test_frozen(): - # The frozen distribution should agree with the regular one - np.random.seed(1234) - x = np.random.randn(5) - mean = np.random.randn(5) - cov = np.abs(np.random.randn(5)) - norm_frozen = multivariate_normal(mean, cov) - assert_allclose(norm_frozen.pdf(x), multivariate_normal.pdf(x, mean, cov)) - assert_allclose(norm_frozen.logpdf(x), - multivariate_normal.logpdf(x, mean, cov)) - - -def test_pseudodet_pinv(): - # Make sure that pseudo-inverse and pseudo-det agree on cutoff - - # Assemble random covariance matrix with large and small eigenvalues - np.random.seed(1234) - n = 7 - x = np.random.randn(n, n) - cov = np.dot(x, x.T) - s, u = scipy.linalg.eigh(cov) - s = 0.5 * np.ones(n) - s[0] = 1.0 - s[-1] = 1e-7 - cov = np.dot(u, np.dot(np.diag(s), u.T)) - - # Set cond so that the lowest eigenvalue is below the cutoff - cond = 1e-5 - psd = _PSD(cov, cond=cond) - psd_pinv = _PSD(psd.pinv, cond=cond) - - # Check that the log pseudo-determinant agrees with the sum - # of the logs of all but the smallest eigenvalue - assert_allclose(psd.log_pdet, np.sum(np.log(s[:-1]))) - # Check that the pseudo-determinant of the pseudo-inverse - # agrees with 1 / pseudo-determinant - assert_allclose(-psd.log_pdet, psd_pinv.log_pdet) - - -def test_exception_nonsquare_cov(): - cov = [[1, 2, 3], [4, 5, 6]] - assert_raises(ValueError, _PSD, cov) - - -def test_exception_nonfinite_cov(): - cov_nan = [[1, 0], [0, np.nan]] - assert_raises(ValueError, _PSD, cov_nan) - cov_inf = [[1, 0], [0, np.inf]] - assert_raises(ValueError, _PSD, cov_inf) - - -def test_exception_non_psd_cov(): - cov = [[1, 0], [0, -1]] - assert_raises(ValueError, _PSD, cov) - - -def test_exception_singular_cov(): - np.random.seed(1234) - x = np.random.randn(5) - mean = np.random.randn(5) - cov = np.ones((5, 5)) - e = np.linalg.LinAlgError - assert_raises(e, multivariate_normal, mean, cov) - assert_raises(e, multivariate_normal.pdf, x, mean, cov) - assert_raises(e, multivariate_normal.logpdf, x, mean, cov) - - -def test_R_values(): - # Compare the multivariate pdf with some values precomputed - # in R version 3.0.1 (2013-05-16) on Mac OS X 10.6. 
- - # The values below were generated by the following R-script: - # > library(mnormt) - # > x <- seq(0, 2, length=5) - # > y <- 3*x - 2 - # > z <- x + cos(y) - # > mu <- c(1, 3, 2) - # > Sigma <- matrix(c(1,2,0,2,5,0.5,0,0.5,3), 3, 3) - # > r_pdf <- dmnorm(cbind(x,y,z), mu, Sigma) - r_pdf = np.array([0.0002214706, 0.0013819953, 0.0049138692, - 0.0103803050, 0.0140250800]) - - x = np.linspace(0, 2, 5) - y = 3 * x - 2 - z = x + np.cos(y) - r = np.array([x, y, z]).T - - mean = np.array([1, 3, 2], 'd') - cov = np.array([[1, 2, 0], [2, 5, .5], [0, .5, 3]], 'd') - - pdf = multivariate_normal.pdf(r, mean, cov) - assert_allclose(pdf, r_pdf, atol=1e-10) - - -def test_multivariate_normal_rvs_zero_covariance(): - mean = np.zeros(2) - covariance = np.zeros((2, 2)) - model = multivariate_normal(mean, covariance, allow_singular=True) - sample = model.rvs() - assert_equal(sample, [0, 0]) - - -def test_rvs_shape(): - # Check that rvs parses the mean and covariance correctly, and returns - # an array of the right shape - N = 300 - d = 4 - sample = multivariate_normal.rvs(mean=np.zeros(d), cov=1, size=N) - assert_equal(sample.shape, (N, d)) - - sample = multivariate_normal.rvs(mean=None, - cov=np.array([[2, .1], [.1, 1]]), - size=N) - assert_equal(sample.shape, (N, 2)) - - u = multivariate_normal(mean=0, cov=1) - sample = u.rvs(N) - assert_equal(sample.shape, (N, )) - - -def test_large_sample(): - # Generate large sample and compare sample mean and sample covariance - # with mean and covariance matrix. - - np.random.seed(2846) - - n = 3 - mean = np.random.randn(n) - M = np.random.randn(n, n) - cov = np.dot(M, M.T) - size = 5000 - - sample = multivariate_normal.rvs(mean, cov, size) - - assert_allclose(numpy.cov(sample.T), cov, rtol=1e-1) - assert_allclose(sample.mean(0), mean, rtol=1e-1) - - -def test_entropy(): - np.random.seed(2846) - - n = 3 - mean = np.random.randn(n) - M = np.random.randn(n, n) - cov = np.dot(M, M.T) - - rv = multivariate_normal(mean, cov) - - # Check that frozen distribution agrees with entropy function - assert_almost_equal(rv.entropy(), multivariate_normal.entropy(mean, cov)) - # Compare entropy with manually computed expression involving - # the sum of the logs of the eigenvalues of the covariance matrix - eigs = np.linalg.eig(cov)[0] - desired = 1 / 2 * (n * (np.log(2 * np.pi) + 1) + np.sum(np.log(eigs))) - assert_almost_equal(desired, rv.entropy()) - - -def test_lnB(): - alpha = np.array([1, 1, 1]) - desired = .5 # e^lnB = 1/2 for [1, 1, 1] - - assert_almost_equal(np.exp(_lnB(alpha)), desired) - - -def test_frozen_dirichlet(): - np.random.seed(2846) - - n = np.random.randint(1, 32) - alpha = np.random.uniform(10e-10, 100, n) - - d = dirichlet(alpha) - - assert_equal(d.var(), dirichlet.var(alpha)) - assert_equal(d.mean(), dirichlet.mean(alpha)) - assert_equal(d.entropy(), dirichlet.entropy(alpha)) - num_tests = 10 - for i in range(num_tests): - x = np.random.uniform(10e-10, 100, n) - x /= np.sum(x) - assert_equal(d.pdf(x[:-1]), dirichlet.pdf(x[:-1], alpha)) - assert_equal(d.logpdf(x[:-1]), dirichlet.logpdf(x[:-1], alpha)) - - -def test_simple_values(): - alpha = np.array([1, 1]) - d = dirichlet(alpha) - - assert_almost_equal(d.mean(), 0.5) - assert_almost_equal(d.var(), 1. / 12.) - - b = beta(1, 1) - assert_almost_equal(d.mean(), b.mean()) - assert_almost_equal(d.var(), b.var()) - - -def test_K_and_K_minus_1_calls_equal(): - # Test that calls with K and K-1 entries yield the same results. 
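# Aside (sketch): the closed form behind test_entropy above is
#     H = 0.5 * (n * (log(2*pi) + 1) + log(det(cov))),
# i.e. a dimension term plus half the sum of the log-eigenvalues.
import numpy as np
from scipy.stats import multivariate_normal
h = multivariate_normal(np.zeros(2), np.diag([1.0, 4.0])).entropy()
assert np.isclose(h, 0.5 * (2 * (np.log(2 * np.pi) + 1) + np.log(4.0)))
# (test_K_and_K_minus_1_calls_equal continues below.)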
- - np.random.seed(2846) - - n = np.random.randint(1, 32) - alpha = np.random.uniform(10e-10, 100, n) - - d = dirichlet(alpha) - num_tests = 10 - for i in range(num_tests): - x = np.random.uniform(10e-10, 100, n) - x /= np.sum(x) - assert_almost_equal(d.pdf(x[:-1]), d.pdf(x)) - - -def test_multiple_entry_calls(): - # Test that calls with multiple x vectors as matrix work - - np.random.seed(2846) - - n = np.random.randint(1, 32) - alpha = np.random.uniform(10e-10, 100, n) - d = dirichlet(alpha) - - num_tests = 10 - num_multiple = 5 - xm = None - for i in range(num_tests): - for m in range(num_multiple): - x = np.random.uniform(10e-10, 100, n) - x /= np.sum(x) - if xm is not None: - xm = np.vstack((xm, x)) - else: - xm = x - rm = d.pdf(xm.T) - rs = None - for xs in xm: - r = d.pdf(xs) - if rs is not None: - rs = np.append(rs, r) - else: - rs = r - assert_array_almost_equal(rm, rs) - - -def test_2D_dirichlet_is_beta(): - np.random.seed(2846) - - alpha = np.random.uniform(10e-10, 100, 2) - d = dirichlet(alpha) - b = beta(alpha[0], alpha[1]) - - num_tests = 10 - for i in range(num_tests): - x = np.random.uniform(10e-10, 100, 2) - x /= np.sum(x) - assert_almost_equal(b.pdf(x), d.pdf([x])) - - assert_almost_equal(b.mean(), d.mean()[0]) - assert_almost_equal(b.var(), d.var()[0]) - - -def test_dimensions_mismatch(): - # Regression test for GH #3493. Check that setting up a PDF with a mean of - # length M and a covariance matrix of size (N, N), where M != N, raises a - # ValueError with an informative error message. - - mu = np.array([0.0, 0.0]) - sigma = np.array([[1.0]]) - - assert_raises(ValueError, multivariate_normal, mu, sigma) - - # A simple check that the right error message was passed along. Checking - # that the entire message is there, word for word, would be somewhat - # fragile, so we just check for the leading part. 
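# Aside (sketch): the Dirichlet/Beta identity used in
# test_2D_dirichlet_is_beta above comes from the simplex constraint
# x2 = 1 - x1, which reduces Dirichlet([a, b]) to Beta(a, b):
import numpy as np
from scipy.stats import beta, dirichlet
a, b, x = 2.0, 5.0, 0.3
assert np.isclose(dirichlet.pdf([x, 1 - x], [a, b]), beta.pdf(x, a, b))
# (test_dimensions_mismatch's try/except check continues below.)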
- try: - multivariate_normal(mu, sigma) - except ValueError as e: - msg = "Dimension mismatch" - assert_equal(str(e)[:len(msg)], msg) - - -if __name__ == "__main__": - run_module_suite() diff --git a/wafo/stats/tests/test_rank.py b/wafo/stats/tests/test_rank.py deleted file mode 100644 index 10e0dc1..0000000 --- a/wafo/stats/tests/test_rank.py +++ /dev/null @@ -1,193 +0,0 @@ -from __future__ import division, print_function, absolute_import - -import numpy as np -from numpy.testing import TestCase, run_module_suite, assert_equal, \ - assert_array_equal - -from wafo.stats import rankdata, tiecorrect - - -class TestTieCorrect(TestCase): - - def test_empty(self): - """An empty array requires no correction, should return 1.0.""" - ranks = np.array([], dtype=np.float64) - c = tiecorrect(ranks) - assert_equal(c, 1.0) - - def test_one(self): - """A single element requires no correction, should return 1.0.""" - ranks = np.array([1.0], dtype=np.float64) - c = tiecorrect(ranks) - assert_equal(c, 1.0) - - def test_no_correction(self): - """Arrays with no ties require no correction.""" - ranks = np.arange(2.0) - c = tiecorrect(ranks) - assert_equal(c, 1.0) - ranks = np.arange(3.0) - c = tiecorrect(ranks) - assert_equal(c, 1.0) - - def test_basic(self): - """Check a few basic examples of the tie correction factor.""" - # One tie of two elements - ranks = np.array([1.0, 2.5, 2.5]) - c = tiecorrect(ranks) - T = 2.0 - N = ranks.size - expected = 1.0 - (T**3 - T) / (N**3 - N) - assert_equal(c, expected) - - # One tie of two elements (same as above, but tie is not at the end) - ranks = np.array([1.5, 1.5, 3.0]) - c = tiecorrect(ranks) - T = 2.0 - N = ranks.size - expected = 1.0 - (T**3 - T) / (N**3 - N) - assert_equal(c, expected) - - # One tie of three elements - ranks = np.array([1.0, 3.0, 3.0, 3.0]) - c = tiecorrect(ranks) - T = 3.0 - N = ranks.size - expected = 1.0 - (T**3 - T) / (N**3 - N) - assert_equal(c, expected) - - # Two ties, lengths 2 and 3. 
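# Aside (sketch, assuming scipy.stats.tiecorrect) before the two-tie
# data below: the tie correction factor is
#     1 - sum(T**3 - T) / (N**3 - N)
# summed over tie groups of size T.  For ranks [1.0, 2.5, 2.5] (one tie
# of two, N = 3) that is 1 - 6/24 = 0.75:
import numpy as np
from scipy.stats import tiecorrect
assert np.isclose(tiecorrect(np.array([1.0, 2.5, 2.5])), 0.75)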
- ranks = np.array([1.5, 1.5, 4.0, 4.0, 4.0]) - c = tiecorrect(ranks) - T1 = 2.0 - T2 = 3.0 - N = ranks.size - expected = 1.0 - ((T1**3 - T1) + (T2**3 - T2)) / (N**3 - N) - assert_equal(c, expected) - - -class TestRankData(TestCase): - - def test_empty(self): - """stats.rankdata([]) should return an empty array.""" - a = np.array([], dtype=np.int) - r = rankdata(a) - assert_array_equal(r, np.array([], dtype=np.float64)) - r = rankdata([]) - assert_array_equal(r, np.array([], dtype=np.float64)) - - def test_one(self): - """Check stats.rankdata with an array of length 1.""" - data = [100] - a = np.array(data, dtype=np.int) - r = rankdata(a) - assert_array_equal(r, np.array([1.0], dtype=np.float64)) - r = rankdata(data) - assert_array_equal(r, np.array([1.0], dtype=np.float64)) - - def test_basic(self): - """Basic tests of stats.rankdata.""" - data = [100, 10, 50] - expected = np.array([3.0, 1.0, 2.0], dtype=np.float64) - a = np.array(data, dtype=np.int) - r = rankdata(a) - assert_array_equal(r, expected) - r = rankdata(data) - assert_array_equal(r, expected) - - data = [40, 10, 30, 10, 50] - expected = np.array([4.0, 1.5, 3.0, 1.5, 5.0], dtype=np.float64) - a = np.array(data, dtype=np.int) - r = rankdata(a) - assert_array_equal(r, expected) - r = rankdata(data) - assert_array_equal(r, expected) - - data = [20, 20, 20, 10, 10, 10] - expected = np.array([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=np.float64) - a = np.array(data, dtype=np.int) - r = rankdata(a) - assert_array_equal(r, expected) - r = rankdata(data) - assert_array_equal(r, expected) - # The docstring states explicitly that the argument is flattened. - a2d = a.reshape(2, 3) - r = rankdata(a2d) - assert_array_equal(r, expected) - - def test_large_int(self): - data = np.array([2**60, 2**60+1], dtype=np.uint64) - r = rankdata(data) - assert_array_equal(r, [1.0, 2.0]) - - data = np.array([2**60, 2**60+1], dtype=np.int64) - r = rankdata(data) - assert_array_equal(r, [1.0, 2.0]) - - data = np.array([2**60, -2**60+1], dtype=np.int64) - r = rankdata(data) - assert_array_equal(r, [2.0, 1.0]) - - def test_big_tie(self): - for n in [10000, 100000, 1000000]: - data = np.ones(n, dtype=int) - r = rankdata(data) - expected_rank = 0.5 * (n + 1) - assert_array_equal(r, expected_rank * data, - "test failed with n=%d" % n) - - -_cases = ( - # values, method, expected - ([], 'average', []), - ([], 'min', []), - ([], 'max', []), - ([], 'dense', []), - ([], 'ordinal', []), - # - ([100], 'average', [1.0]), - ([100], 'min', [1.0]), - ([100], 'max', [1.0]), - ([100], 'dense', [1.0]), - ([100], 'ordinal', [1.0]), - # - ([100, 100, 100], 'average', [2.0, 2.0, 2.0]), - ([100, 100, 100], 'min', [1.0, 1.0, 1.0]), - ([100, 100, 100], 'max', [3.0, 3.0, 3.0]), - ([100, 100, 100], 'dense', [1.0, 1.0, 1.0]), - ([100, 100, 100], 'ordinal', [1.0, 2.0, 3.0]), - # - ([100, 300, 200], 'average', [1.0, 3.0, 2.0]), - ([100, 300, 200], 'min', [1.0, 3.0, 2.0]), - ([100, 300, 200], 'max', [1.0, 3.0, 2.0]), - ([100, 300, 200], 'dense', [1.0, 3.0, 2.0]), - ([100, 300, 200], 'ordinal', [1.0, 3.0, 2.0]), - # - ([100, 200, 300, 200], 'average', [1.0, 2.5, 4.0, 2.5]), - ([100, 200, 300, 200], 'min', [1.0, 2.0, 4.0, 2.0]), - ([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]), - ([100, 200, 300, 200], 'dense', [1.0, 2.0, 3.0, 2.0]), - ([100, 200, 300, 200], 'ordinal', [1.0, 2.0, 4.0, 3.0]), - # - ([100, 200, 300, 200, 100], 'average', [1.5, 3.5, 5.0, 3.5, 1.5]), - ([100, 200, 300, 200, 100], 'min', [1.0, 3.0, 5.0, 3.0, 1.0]), - ([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 
2.0]), - ([100, 200, 300, 200, 100], 'dense', [1.0, 2.0, 3.0, 2.0, 1.0]), - ([100, 200, 300, 200, 100], 'ordinal', [1.0, 3.0, 5.0, 4.0, 2.0]), - # - ([10] * 30, 'ordinal', np.arange(1.0, 31.0)), -) - - -def test_cases(): - - def check_case(values, method, expected): - r = rankdata(values, method=method) - assert_array_equal(r, expected) - - for values, method, expected in _cases: - yield check_case, values, method, expected - - -if __name__ == "__main__": - run_module_suite() diff --git a/wafo/stats/tests/test_stats.py b/wafo/stats/tests/test_stats.py deleted file mode 100644 index 8f87fbb..0000000 --- a/wafo/stats/tests/test_stats.py +++ /dev/null @@ -1,2830 +0,0 @@ -""" Test functions for stats module - - WRITTEN BY LOUIS LUANGKESORN FOR THE STATS MODULE - BASED ON WILKINSON'S STATISTICS QUIZ - http://www.stanford.edu/~clint/bench/wilk.txt - - Additional tests by a host of SciPy developers. -""" -from __future__ import division, print_function, absolute_import - -import sys -import warnings -from collections import namedtuple - -from numpy.testing import (TestCase, assert_, assert_equal, - assert_almost_equal, assert_array_almost_equal, - assert_array_equal, assert_approx_equal, - assert_raises, run_module_suite, assert_allclose, - dec) -import numpy.ma.testutils as mat -from numpy import array, arange, float32, float64, power -import numpy as np - -import wafo.stats as stats - - -""" Numbers in docstrings beginning with 'W' refer to the section numbers - and headings found in the STATISTICS QUIZ of Leland Wilkinson. These are - considered to be essential functionality. True testing and - evaluation of a statistics package requires use of the - NIST Statistical test data. See McCoullough(1999) Assessing The Reliability - of Statistical Software for a test methodology and its - implementation in testing SAS, SPSS, and S-Plus -""" - -# Datasets -# These data sets are from the nasty.dat sets used by Wilkinson -# For completeness, I should write the relevant tests and count them as failures -# Somewhat acceptable, since this is still beta software. 
It would count as a -# good target for 1.0 status -X = array([1,2,3,4,5,6,7,8,9], float) -ZERO = array([0,0,0,0,0,0,0,0,0], float) -BIG = array([99999991,99999992,99999993,99999994,99999995,99999996,99999997, - 99999998,99999999], float) -LITTLE = array([0.99999991,0.99999992,0.99999993,0.99999994,0.99999995,0.99999996, - 0.99999997,0.99999998,0.99999999], float) -HUGE = array([1e+12,2e+12,3e+12,4e+12,5e+12,6e+12,7e+12,8e+12,9e+12], float) -TINY = array([1e-12,2e-12,3e-12,4e-12,5e-12,6e-12,7e-12,8e-12,9e-12], float) -ROUND = array([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5], float) - - -class TestTrimmedStats(TestCase): - # TODO: write these tests to handle missing values properly - dprec = np.finfo(np.float64).precision - - def test_tmean(self): - y = stats.tmean(X, (2, 8), (True, True)) - assert_approx_equal(y, 5.0, significant=self.dprec) - - y1 = stats.tmean(X, limits=(2, 8), inclusive=(False, False)) - y2 = stats.tmean(X, limits=None) - assert_approx_equal(y1, y2, significant=self.dprec) - - def test_tvar(self): - y = stats.tvar(X, limits=(2, 8), inclusive=(True, True)) - assert_approx_equal(y, 4.6666666666666661, significant=self.dprec) - - y = stats.tvar(X, limits=None) - assert_approx_equal(y, X.var(ddof=1), significant=self.dprec) - - def test_tstd(self): - y = stats.tstd(X, (2, 8), (True, True)) - assert_approx_equal(y, 2.1602468994692865, significant=self.dprec) - - y = stats.tstd(X, limits=None) - assert_approx_equal(y, X.std(ddof=1), significant=self.dprec) - - def test_tmin(self): - x = np.arange(10) - assert_equal(stats.tmin(x), 0) - assert_equal(stats.tmin(x, lowerlimit=0), 0) - assert_equal(stats.tmin(x, lowerlimit=0, inclusive=False), 1) - - x = x.reshape((5, 2)) - assert_equal(stats.tmin(x, lowerlimit=0, inclusive=False), [2, 1]) - assert_equal(stats.tmin(x, axis=1), [0, 2, 4, 6, 8]) - assert_equal(stats.tmin(x, axis=None), 0) - - def test_tmax(self): - x = np.arange(10) - assert_equal(stats.tmax(x), 9) - assert_equal(stats.tmax(x, upperlimit=9),9) - assert_equal(stats.tmax(x, upperlimit=9, inclusive=False), 8) - - x = x.reshape((5, 2)) - assert_equal(stats.tmax(x, upperlimit=9, inclusive=False), [8, 7]) - assert_equal(stats.tmax(x, axis=1), [1, 3, 5, 7, 9]) - assert_equal(stats.tmax(x, axis=None), 9) - - def test_tsem(self): - y = stats.tsem(X, limits=(3, 8), inclusive=(False, True)) - y_ref = np.array([4, 5, 6, 7, 8]) - assert_approx_equal(y, y_ref.std(ddof=1) / np.sqrt(y_ref.size), - significant=self.dprec) - - assert_approx_equal(stats.tsem(X, limits=[-1, 10]), - stats.tsem(X, limits=None), - significant=self.dprec) - - -class TestNanFunc(TestCase): - def __init__(self, *args, **kw): - TestCase.__init__(self, *args, **kw) - self.X = X.copy() - - self.Xall = X.copy() - self.Xall[:] = np.nan - - self.Xsome = X.copy() - self.Xsomet = X.copy() - self.Xsome[0] = np.nan - self.Xsomet = self.Xsomet[1:] - - def test_nanmean_none(self): - # Check nanmean when no values are nan. - m = stats.nanmean(X) - assert_approx_equal(m, X[4]) - - def test_nanmean_some(self): - # Check nanmean when some values only are nan. - m = stats.nanmean(self.Xsome) - assert_approx_equal(m, 5.5) - - def test_nanmean_all(self): - # Check nanmean when all values are nan. - olderr = np.seterr(all='ignore') - try: - m = stats.nanmean(self.Xall) - finally: - np.seterr(**olderr) - assert_(np.isnan(m)) - - def test_nanstd_none(self): - # Check nanstd when no values are nan. 
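# Aside (sketch): the tmean checks above simply average the values lying
# inside the limits -- for X = 1..9 with inclusive limits (2, 8) the
# kept values are 2, 3, ..., 8, whose mean is 5.0:
import numpy as np
from scipy.stats import tmean
assert tmean(np.arange(1.0, 10.0), (2, 8), (True, True)) == 5.0
# (test_nanstd_none continues below.)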
-        s = stats.nanstd(self.X)
-        assert_approx_equal(s, np.std(self.X, ddof=1))
-
-    def test_nanstd_some(self):
-        # Check nanstd when some values only are nan.
-        s = stats.nanstd(self.Xsome)
-        assert_approx_equal(s, np.std(self.Xsomet, ddof=1))
-
-    def test_nanstd_all(self):
-        # Check nanstd when all values are nan.
-        olderr = np.seterr(all='ignore')
-        try:
-            s = stats.nanstd(self.Xall)
-        finally:
-            np.seterr(**olderr)
-        assert_(np.isnan(s))
-
-    def test_nanstd_bias_kw(self):
-        s = stats.nanstd(self.X, bias=True)
-        assert_approx_equal(s, np.std(self.X, ddof=0))
-
-    def test_nanstd_negative_axis(self):
-        x = np.array([1, 2, 3])
-        assert_equal(stats.nanstd(x, -1), 1)
-
-    def test_nanmedian_none(self):
-        # Check nanmedian when no values are nan.
-        m = stats.nanmedian(self.X)
-        assert_approx_equal(m, np.median(self.X))
-
-    def test_nanmedian_axis(self):
-        # Check nanmedian with axis
-        X = self.X.reshape(3,3)
-        m = stats.nanmedian(X, axis=0)
-        assert_equal(m, np.median(X, axis=0))
-        m = stats.nanmedian(X, axis=1)
-        assert_equal(m, np.median(X, axis=1))
-
-    def test_nanmedian_some(self):
-        # Check nanmedian when some values only are nan.
-        m = stats.nanmedian(self.Xsome)
-        assert_approx_equal(m, np.median(self.Xsomet))
-
-    def test_nanmedian_all(self):
-        # Check nanmedian when all values are nan.
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter('always')
-            m = stats.nanmedian(self.Xall)
-            assert_(np.isnan(m))
-            assert_equal(len(w), 1)
-            assert_(issubclass(w[0].category, RuntimeWarning))
-
-    def test_nanmedian_all_axis(self):
-        # Check nanmedian when all values are nan.
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter('always')
-            m = stats.nanmedian(self.Xall.reshape(3,3), axis=1)
-            assert_(np.isnan(m).all())
-            assert_equal(len(w), 3)
-            assert_(issubclass(w[0].category, RuntimeWarning))
-
-    def test_nanmedian_scalars(self):
-        # Check nanmedian for scalar inputs. See ticket #1098.
-        assert_equal(stats.nanmedian(1), np.median(1))
-        assert_equal(stats.nanmedian(True), np.median(True))
-        assert_equal(stats.nanmedian(np.array(1)), np.median(np.array(1)))
-        assert_equal(stats.nanmedian(np.nan), np.median(np.nan))
-
-
-class TestCorrPearsonr(TestCase):
-    """ W.II.D. Compute a correlation matrix on all the variables.
-
-        All the correlations, except for ZERO and MISS, should be exactly 1.
-        ZERO and MISS should have undefined or missing correlations with the
-        other variables.  The same should go for SPEARMAN correlations, if
-        your program has them.
- """ - def test_pXX(self): - y = stats.pearsonr(X,X) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pXBIG(self): - y = stats.pearsonr(X,BIG) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pXLITTLE(self): - y = stats.pearsonr(X,LITTLE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pXHUGE(self): - y = stats.pearsonr(X,HUGE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pXTINY(self): - y = stats.pearsonr(X,TINY) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pXROUND(self): - y = stats.pearsonr(X,ROUND) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pBIGBIG(self): - y = stats.pearsonr(BIG,BIG) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pBIGLITTLE(self): - y = stats.pearsonr(BIG,LITTLE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pBIGHUGE(self): - y = stats.pearsonr(BIG,HUGE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pBIGTINY(self): - y = stats.pearsonr(BIG,TINY) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pBIGROUND(self): - y = stats.pearsonr(BIG,ROUND) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pLITTLELITTLE(self): - y = stats.pearsonr(LITTLE,LITTLE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pLITTLEHUGE(self): - y = stats.pearsonr(LITTLE,HUGE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pLITTLETINY(self): - y = stats.pearsonr(LITTLE,TINY) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pLITTLEROUND(self): - y = stats.pearsonr(LITTLE,ROUND) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pHUGEHUGE(self): - y = stats.pearsonr(HUGE,HUGE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pHUGETINY(self): - y = stats.pearsonr(HUGE,TINY) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pHUGEROUND(self): - y = stats.pearsonr(HUGE,ROUND) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pTINYTINY(self): - y = stats.pearsonr(TINY,TINY) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pTINYROUND(self): - y = stats.pearsonr(TINY,ROUND) - r = y[0] - assert_approx_equal(r,1.0) - - def test_pROUNDROUND(self): - y = stats.pearsonr(ROUND,ROUND) - r = y[0] - assert_approx_equal(r,1.0) - - def test_r_exactly_pos1(self): - a = arange(3.0) - b = a - r, prob = stats.pearsonr(a,b) - assert_equal(r, 1.0) - assert_equal(prob, 0.0) - - def test_r_exactly_neg1(self): - a = arange(3.0) - b = -a - r, prob = stats.pearsonr(a,b) - assert_equal(r, -1.0) - assert_equal(prob, 0.0) - - def test_basic(self): - # A basic test, with a correlation coefficient - # that is not 1 or -1. - a = array([-1, 0, 1]) - b = array([0, 0, 3]) - r, prob = stats.pearsonr(a, b) - assert_approx_equal(r, np.sqrt(3)/2) - assert_approx_equal(prob, 1.0/3) - - -class TestFisherExact(TestCase): - """Some tests to show that fisher_exact() works correctly. - - Note that in SciPy 0.9.0 this was not working well for large numbers due to - inaccuracy of the hypergeom distribution (see #1218). Fixed now. - - Also note that R and Scipy have different argument formats for their - hypergeometric distribution functions. 
- - R: - > phyper(18999, 99000, 110000, 39000, lower.tail = FALSE) - [1] 1.701815e-09 - """ - def test_basic(self): - fisher_exact = stats.fisher_exact - - res = fisher_exact([[14500, 20000], [30000, 40000]])[1] - assert_approx_equal(res, 0.01106, significant=4) - res = fisher_exact([[100, 2], [1000, 5]])[1] - assert_approx_equal(res, 0.1301, significant=4) - res = fisher_exact([[2, 7], [8, 2]])[1] - assert_approx_equal(res, 0.0230141, significant=6) - res = fisher_exact([[5, 1], [10, 10]])[1] - assert_approx_equal(res, 0.1973244, significant=6) - res = fisher_exact([[5, 15], [20, 20]])[1] - assert_approx_equal(res, 0.0958044, significant=6) - res = fisher_exact([[5, 16], [20, 25]])[1] - assert_approx_equal(res, 0.1725862, significant=6) - res = fisher_exact([[10, 5], [10, 1]])[1] - assert_approx_equal(res, 0.1973244, significant=6) - res = fisher_exact([[5, 0], [1, 4]])[1] - assert_approx_equal(res, 0.04761904, significant=6) - res = fisher_exact([[0, 1], [3, 2]])[1] - assert_approx_equal(res, 1.0) - res = fisher_exact([[0, 2], [6, 4]])[1] - assert_approx_equal(res, 0.4545454545) - res = fisher_exact([[2, 7], [8, 2]]) - assert_approx_equal(res[1], 0.0230141, significant=6) - assert_approx_equal(res[0], 4.0 / 56) - - def test_precise(self): - # results from R - # - # R defines oddsratio differently (see Notes section of fisher_exact - # docstring), so those will not match. We leave them in anyway, in - # case they will be useful later on. We test only the p-value. - tablist = [ - ([[100, 2], [1000, 5]], (2.505583993422285e-001, 1.300759363430016e-001)), - ([[2, 7], [8, 2]], (8.586235135736206e-002, 2.301413756522114e-002)), - ([[5, 1], [10, 10]], (4.725646047336584e+000, 1.973244147157190e-001)), - ([[5, 15], [20, 20]], (3.394396617440852e-001, 9.580440012477637e-002)), - ([[5, 16], [20, 25]], (3.960558326183334e-001, 1.725864953812994e-001)), - ([[10, 5], [10, 1]], (2.116112781158483e-001, 1.973244147157190e-001)), - ([[10, 5], [10, 0]], (0.000000000000000e+000, 6.126482213438734e-002)), - ([[5, 0], [1, 4]], (np.inf, 4.761904761904762e-002)), - ([[0, 5], [1, 4]], (0.000000000000000e+000, 1.000000000000000e+000)), - ([[5, 1], [0, 4]], (np.inf, 4.761904761904758e-002)), - ([[0, 1], [3, 2]], (0.000000000000000e+000, 1.000000000000000e+000)) - ] - for table, res_r in tablist: - res = stats.fisher_exact(np.asarray(table)) - np.testing.assert_almost_equal(res[1], res_r[1], decimal=11, - verbose=True) - - @dec.slow - def test_large_numbers(self): - # Test with some large numbers. Regression test for #1401 - pvals = [5.56e-11, 2.666e-11, 1.363e-11] # from R - for pval, num in zip(pvals, [75, 76, 77]): - res = stats.fisher_exact([[17704, 496], [1065, num]])[1] - assert_approx_equal(res, pval, significant=4) - - res = stats.fisher_exact([[18000, 80000], [20000, 90000]])[1] - assert_approx_equal(res, 0.2751, significant=4) - - def test_raises(self): - # test we raise an error for wrong shape of input. 
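The R phyper transcript quoted in the class docstring works because Fisher's one-sided p-value is exactly a hypergeometric tail probability. As an illustrative aside (the variable names here are ours, not the test suite's; only numpy and scipy.stats are assumed), the identity can be checked against one of the tables used above:

import numpy as np
from scipy import stats

# For a 2x2 table [[a, b], [c, d]], the one-sided ("greater") p-value is
# P(X >= a) with X ~ Hypergeom(M=a+b+c+d, n=a+b, N=a+c), i.e. the upper
# tail that R's phyper(..., lower.tail = FALSE) computes.
table = np.array([[5, 1], [10, 10]])
a, b = table[0]
c, d = table[1]
M, n, N = a + b + c + d, a + b, a + c
p_tail = stats.hypergeom.sf(a - 1, M, n, N)   # sf is exclusive, hence a - 1
oddsratio, p_greater = stats.fisher_exact(table, alternative='greater')
assert np.isclose(p_greater, p_tail)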
assert_raises(ValueError, stats.fisher_exact, - np.arange(6).reshape(2, 3)) - - def test_row_or_col_zero(self): - tables = ([[0, 0], [5, 10]], - [[5, 10], [0, 0]], - [[0, 5], [0, 10]], - [[5, 0], [10, 0]]) - for table in tables: - oddsratio, pval = stats.fisher_exact(table) - assert_equal(pval, 1.0) - assert_equal(oddsratio, np.nan) - - def test_less_greater(self): - tables = ( - # Some tables to compare with R: - [[2, 7], [8, 2]], - [[200, 7], [8, 300]], - [[28, 21], [6, 1957]], - [[190, 800], [200, 900]], - # Some tables with simple exact values - # (includes regression test for ticket #1568): - [[0, 2], [3, 0]], - [[1, 1], [2, 1]], - [[2, 0], [1, 2]], - [[0, 1], [2, 3]], - [[1, 0], [1, 4]], - ) - pvals = ( - # from R: - [0.018521725952066501, 0.9990149169715733], - [1.0, 2.0056578803889148e-122], - [1.0, 5.7284374608319831e-44], - [0.7416227, 0.2959826], - # Exact: - [0.1, 1.0], - [0.7, 0.9], - [1.0, 0.3], - [2./3, 1.0], - [1.0, 1./3], - ) - for table, pval in zip(tables, pvals): - res = [] - res.append(stats.fisher_exact(table, alternative="less")[1]) - res.append(stats.fisher_exact(table, alternative="greater")[1]) - assert_allclose(res, pval, atol=0, rtol=1e-7) - - def test_gh3014(self): - # check if issue #3014 has been fixed. - # before, this would have raised a ValueError - odds, pvalue = stats.fisher_exact([[1, 2], [9, 84419233]]) - - -class TestCorrSpearmanr(TestCase): - """ W.II.D. Compute a correlation matrix on all the variables. - - All the correlations, except for ZERO and MISS, should be exactly 1. - ZERO and MISS should have undefined or missing correlations with the - other variables. The same should go for SPEARMAN correlations, if - your program has them. - """ - def test_sXX(self): - y = stats.spearmanr(X,X) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sXBIG(self): - y = stats.spearmanr(X,BIG) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sXLITTLE(self): - y = stats.spearmanr(X,LITTLE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sXHUGE(self): - y = stats.spearmanr(X,HUGE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sXTINY(self): - y = stats.spearmanr(X,TINY) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sXROUND(self): - y = stats.spearmanr(X,ROUND) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sBIGBIG(self): - y = stats.spearmanr(BIG,BIG) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sBIGLITTLE(self): - y = stats.spearmanr(BIG,LITTLE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sBIGHUGE(self): - y = stats.spearmanr(BIG,HUGE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sBIGTINY(self): - y = stats.spearmanr(BIG,TINY) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sBIGROUND(self): - y = stats.spearmanr(BIG,ROUND) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sLITTLELITTLE(self): - y = stats.spearmanr(LITTLE,LITTLE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sLITTLEHUGE(self): - y = stats.spearmanr(LITTLE,HUGE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sLITTLETINY(self): - y = stats.spearmanr(LITTLE,TINY) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sLITTLEROUND(self): - y = stats.spearmanr(LITTLE,ROUND) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sHUGEHUGE(self): - y = stats.spearmanr(HUGE,HUGE) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sHUGETINY(self): - y = stats.spearmanr(HUGE,TINY) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sHUGEROUND(self): - y = stats.spearmanr(HUGE,ROUND) - r = y[0] - assert_approx_equal(r,1.0) - - def
test_sTINYTINY(self): - y = stats.spearmanr(TINY,TINY) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sTINYROUND(self): - y = stats.spearmanr(TINY,ROUND) - r = y[0] - assert_approx_equal(r,1.0) - - def test_sROUNDROUND(self): - y = stats.spearmanr(ROUND,ROUND) - r = y[0] - assert_approx_equal(r,1.0) - - -class TestCorrSpearmanrTies(TestCase): - """Some tests of tie-handling by the spearmanr function.""" - - def test_tie1(self): - # Data - x = [1.0, 2.0, 3.0, 4.0] - y = [1.0, 2.0, 2.0, 3.0] - # Ranks of the data, with tie-handling. - xr = [1.0, 2.0, 3.0, 4.0] - yr = [1.0, 2.5, 2.5, 4.0] - # Result of spearmanr should be the same as applying - # pearsonr to the ranks. - sr = stats.spearmanr(x, y) - pr = stats.pearsonr(xr, yr) - assert_almost_equal(sr, pr) - - -## W.II.E. Tabulate X against X, using BIG as a case weight. The values -## should appear on the diagonal and the total should be 899999955. -## If the table cannot hold these values, forget about working with -## census data. You can also tabulate HUGE against TINY. There is no -## reason a tabulation program should not be able to distinguish -## different values regardless of their magnitude. - -### I need to figure out how to do this one. - - -def test_kendalltau(): - # with some ties - x1 = [12, 2, 1, 12, 2] - x2 = [1, 4, 7, 1, 0] - expected = (-0.47140452079103173, 0.24821309157521476) - res = stats.kendalltau(x1, x2) - assert_approx_equal(res[0], expected[0]) - assert_approx_equal(res[1], expected[1]) - - # with only ties in one or both inputs - assert_equal(stats.kendalltau([2,2,2], [2,2,2]), (np.nan, np.nan)) - assert_equal(stats.kendalltau([2,0,2], [2,2,2]), (np.nan, np.nan)) - assert_equal(stats.kendalltau([2,2,2], [2,0,2]), (np.nan, np.nan)) - - # empty arrays provided as input - assert_equal(stats.kendalltau([], []), (np.nan, np.nan)) - - # check two different sort methods - assert_approx_equal(stats.kendalltau(x1, x2, initial_lexsort=False)[1], - stats.kendalltau(x1, x2, initial_lexsort=True)[1]) - - # and with larger arrays - np.random.seed(7546) - x = np.array([np.random.normal(loc=1, scale=1, size=500), - np.random.normal(loc=1, scale=1, size=500)]) - corr = [[1.0, 0.3], - [0.3, 1.0]] - x = np.dot(np.linalg.cholesky(corr), x) - expected = (0.19291382765531062, 1.1337108207276285e-10) - res = stats.kendalltau(x[0], x[1]) - assert_approx_equal(res[0], expected[0]) - assert_approx_equal(res[1], expected[1]) - - # and do we get a tau of 1 for identical inputs? - assert_approx_equal(stats.kendalltau([1,1,2], [1,1,2])[0], 1.0) - - -class TestRegression(TestCase): - def test_linregressBIGX(self): - # W.II.F. Regress BIG on X. - # The constant should be 99999990 and the regression coefficient should be 1. - y = stats.linregress(X,BIG) - intercept = y[1] - r = y[2] - assert_almost_equal(intercept,99999990) - assert_almost_equal(r,1.0) - - def test_regressXX(self): - # W.IV.B. Regress X on X. - # The constant should be exactly 0 and the regression coefficient should be 1. - # This is a perfectly valid regression. The program should not complain. - y = stats.linregress(X,X) - intercept = y[1] - r = y[2] - assert_almost_equal(intercept,0.0) - assert_almost_equal(r,1.0) -## W.IV.C. Regress X on BIG and LITTLE (two predictors). The program -## should tell you that this model is "singular" because BIG and -## LITTLE are linear combinations of each other. Cryptic error -## messages are unacceptable here. Singularity is the most -## fundamental regression error. -### Need to figure out how to handle multiple linear regression. 
Not obvious - - def test_regressZEROX(self): - # W.IV.D. Regress ZERO on X. - # The program should inform you that ZERO has no variance or it should - # go ahead and compute the regression and report a correlation and - # total sum of squares of exactly 0. - y = stats.linregress(X,ZERO) - intercept = y[1] - r = y[2] - assert_almost_equal(intercept,0.0) - assert_almost_equal(r,0.0) - - def test_regress_simple(self): - # Regress a line with sinusoidal noise. - x = np.linspace(0, 100, 100) - y = 0.2 * np.linspace(0, 100, 100) + 10 - y += np.sin(np.linspace(0, 20, 100)) - - res = stats.linregress(x, y) - assert_almost_equal(res[4], 2.3957814497838803e-3) - - def test_regress_simple_onearg_rows(self): - # Regress a line with sinusoidal noise, with a single input of shape (2, N). - x = np.linspace(0, 100, 100) - y = 0.2 * np.linspace(0, 100, 100) + 10 - y += np.sin(np.linspace(0, 20, 100)) - rows = np.vstack((x, y)) - - res = stats.linregress(rows) - assert_almost_equal(res[4], 2.3957814497838803e-3) - - def test_regress_simple_onearg_cols(self): - x = np.linspace(0, 100, 100) - y = 0.2 * np.linspace(0, 100, 100) + 10 - y += np.sin(np.linspace(0, 20, 100)) - cols = np.hstack((np.expand_dims(x, 1), np.expand_dims(y, 1))) - - res = stats.linregress(cols) - assert_almost_equal(res[4], 2.3957814497838803e-3) - - def test_regress_shape_error(self): - # Check that a single input argument to linregress with wrong shape - # results in a ValueError. - assert_raises(ValueError, stats.linregress, np.ones((3, 3))) - - def test_linregress(self): - # compared with multivariate ols with pinv - x = np.arange(11) - y = np.arange(5,16) - y[[(1),(-2)]] -= 1 - y[[(0),(-1)]] += 1 - - res = (1.0, 5.0, 0.98229948625750, 7.45259691e-008, 0.063564172616372733) - assert_array_almost_equal(stats.linregress(x,y),res,decimal=14) - - def test_regress_simple_negative_cor(self): - # If the slope of the regression is negative, the factor R tends to -1, not 1. - # Sometimes rounding errors make it < -1, leading to stderr being NaN - a, n = 1e-71, 100000 - x = np.linspace(a, 2 * a, n) - y = np.linspace(2 * a, a, n) - stats.linregress(x, y) - res = stats.linregress(x, y) - assert_(res[2] >= -1) # propagated numerical errors were not corrected - assert_almost_equal(res[2], -1) # perfect negative correlation case - assert_(not np.isnan(res[4])) # stderr should stay finite - - -def test_theilslopes(): - # Basic slope test. - slope, intercept, lower, upper = stats.theilslopes([0,1,1]) - assert_almost_equal(slope, 0.5) - assert_almost_equal(intercept, 0.5) - - # Test of confidence intervals.
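Before the confidence-interval data, a quick sketch of what theilslopes computes: the Theil-Sen slope is the median of all pairwise slopes, and the intercept defaults to median(y) - slope*median(x). This is an illustrative aside using the same x/y as the test data that follows; np.triu_indices merely enumerates the index pairs i < j:

import numpy as np
from scipy import stats

x = np.array([1, 2, 3, 4, 10, 12, 18], dtype=float)
y = np.array([9, 15, 19, 20, 45, 55, 78], dtype=float)
i, j = np.triu_indices(len(x), k=1)            # all pairs with i < j
pairwise_slopes = (y[j] - y[i]) / (x[j] - x[i])
slope, intercept, lower, upper = stats.theilslopes(y, x)
assert np.isclose(slope, np.median(pairwise_slopes))
assert np.isclose(intercept, np.median(y) - slope * np.median(x))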
- x = [1, 2, 3, 4, 10, 12, 18] - y = [9, 15, 19, 20, 45, 55, 78] - slope, intercept, lower, upper = stats.theilslopes(y, x, 0.07) - assert_almost_equal(slope, 4) - assert_almost_equal(upper, 4.38, decimal=2) - assert_almost_equal(lower, 3.71, decimal=2) - - -class TestHistogram(TestCase): - # Tests that histogram works as it should, and keeps old behaviour - # - # what is untested: - # - multidimensional arrays (since 'a' is ravel'd as the first line in the method) - # - very large arrays - # - Nans, Infs, empty and otherwise bad inputs - - # sample arrays to test the histogram with - low_values = np.array([0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.7, 0.8, 0.9, 1.1, 1.2], - dtype=float) # 11 values - high_range = np.array([2, 3, 4, 2, 21, 32, 78, 95, 65, 66, 66, 66, 66, 4], - dtype=float) # 14 values - low_range = np.array([2, 3, 3, 2, 3, 2.4, 2.1, 3.1, 2.9, 2.6, 2.7, 2.8, 2.2, 2.001], - dtype=float) # 14 values - few_values = np.array([2.0, 3.0, -1.0, 0.0], dtype=float) # 4 values - - def test_simple(self): - # Tests that each of the tests works as expected with default params - # - # basic tests, with expected results (no weighting) - # results taken from the previous (slower) version of histogram - basic_tests = ((self.low_values, (np.array([1., 1., 1., 2., 2., - 1., 1., 0., 1., 1.]), - 0.14444444444444446, 0.11111111111111112, 0)), - (self.high_range, (np.array([5., 0., 1., 1., 0., - 0., 5., 1., 0., 1.]), - -3.1666666666666661, 10.333333333333332, 0)), - (self.low_range, (np.array([3., 1., 1., 1., 0., 1., - 1., 2., 3., 1.]), - 1.9388888888888889, 0.12222222222222223, 0)), - (self.few_values, (np.array([1., 0., 1., 0., 0., 0., - 0., 1., 0., 1.]), - -1.2222222222222223, 0.44444444444444448, 0)), - ) - for inputs, expected_results in basic_tests: - given_results = stats.histogram(inputs) - assert_array_almost_equal(expected_results[0], given_results[0], - decimal=2) - for i in range(1, 4): - assert_almost_equal(expected_results[i], given_results[i], - decimal=2) - - def test_weighting(self): - # Tests that weights give expected histograms - - # basic tests, with expected results, given a set of weights - # weights used (first n are used for each test, where n is len of array) (14 values) - weights = np.array([1., 3., 4.5, 0.1, -1.0, 0.0, 0.3, 7.0, 103.2, 2, 40, 0, 0, 1]) - # results taken from the numpy version of histogram - basic_tests = ((self.low_values, (np.array([4.0, 0.0, 4.5, -0.9, 0.0, - 0.3,110.2, 0.0, 0.0, 42.0]), - 0.2, 0.1, 0)), - (self.high_range, (np.array([9.6, 0., -1., 0., 0., - 0.,145.2, 0., 0.3, 7.]), - 2.0, 9.3, 0)), - (self.low_range, (np.array([2.4, 0., 0., 0., 0., - 2., 40., 0., 103.2, 13.5]), - 2.0, 0.11, 0)), - (self.few_values, (np.array([4.5, 0., 0.1, 0., 0., 0., - 0., 1., 0., 3.]), - -1., 0.4, 0)), - - ) - for inputs, expected_results in basic_tests: - # use the first lot of weights for test - # default limits given to reproduce output of numpy's test better - given_results = stats.histogram(inputs, defaultlimits=(inputs.min(), - inputs.max()), - weights=weights[:len(inputs)]) - assert_array_almost_equal(expected_results[0], given_results[0], - decimal=2) - for i in range(1, 4): - assert_almost_equal(expected_results[i], given_results[i], - decimal=2) - - def test_reduced_bins(self): - # Tests that reducing the number of bins produces expected results - - # basic tests, with expected results (no weighting), - # except number of bins is halved to 5 - # results taken from the previous (slower) version of histogram - basic_tests = ((self.low_values, (np.array([2., 3., 3., 
1., 2.]), - 0.075000000000000011, 0.25, 0)), - (self.high_range, (np.array([5., 2., 0., 6., 1.]), - -9.625, 23.25, 0)), - (self.low_range, (np.array([4., 2., 1., 3., 4.]), - 1.8625, 0.27500000000000002, 0)), - (self.few_values, (np.array([1., 1., 0., 1., 1.]), - -1.5, 1.0, 0)), - ) - for inputs, expected_results in basic_tests: - given_results = stats.histogram(inputs, numbins=5) - assert_array_almost_equal(expected_results[0], given_results[0], - decimal=2) - for i in range(1, 4): - assert_almost_equal(expected_results[i], given_results[i], - decimal=2) - - def test_increased_bins(self): - # Tests that increasing the number of bins produces expected results - - # basic tests, with expected results (no weighting), - # except number of bins is double to 20 - # results taken from the previous (slower) version of histogram - basic_tests = ((self.low_values, (np.array([1., 0., 1., 0., 1., - 0., 2., 0., 1., 0., - 1., 1., 0., 1., 0., - 0., 0., 1., 0., 1.]), - 0.1736842105263158, 0.052631578947368418, 0)), - (self.high_range, (np.array([5., 0., 0., 0., 1., - 0., 1., 0., 0., 0., - 0., 0., 0., 5., 0., - 0., 1., 0., 0., 1.]), - -0.44736842105263142, 4.8947368421052628, 0)), - (self.low_range, (np.array([3., 0., 1., 1., 0., 0., - 0., 1., 0., 0., 1., 0., - 1., 0., 1., 0., 1., 3., - 0., 1.]), - 1.9710526315789474, 0.057894736842105263, 0)), - (self.few_values, (np.array([1., 0., 0., 0., 0., 1., - 0., 0., 0., 0., 0., 0., - 0., 0., 1., 0., 0., 0., - 0., 1.]), - -1.1052631578947367, 0.21052631578947367, 0)), - ) - for inputs, expected_results in basic_tests: - given_results = stats.histogram(inputs, numbins=20) - assert_array_almost_equal(expected_results[0], given_results[0], - decimal=2) - for i in range(1, 4): - assert_almost_equal(expected_results[i], given_results[i], - decimal=2) - - -def test_cumfreq(): - x = [1, 4, 2, 1, 3, 1] - cumfreqs, lowlim, binsize, extrapoints = stats.cumfreq(x, numbins=4) - assert_array_almost_equal(cumfreqs, np.array([3., 4., 5., 6.])) - cumfreqs, lowlim, binsize, extrapoints = stats.cumfreq(x, numbins=4, - defaultreallimits=(1.5, 5)) - assert_(extrapoints == 3) - - -def test_relfreq(): - a = np.array([1, 4, 2, 1, 3, 1]) - relfreqs, lowlim, binsize, extrapoints = stats.relfreq(a, numbins=4) - assert_array_almost_equal(relfreqs, - array([0.5, 0.16666667, 0.16666667, 0.16666667])) - - # check array_like input is accepted - relfreqs2, lowlim, binsize, extrapoints = stats.relfreq([1, 4, 2, 1, 3, 1], - numbins=4) - assert_array_almost_equal(relfreqs, relfreqs2) - - -class TestGMean(TestCase): - - def test_1D_list(self): - a = (1,2,3,4) - actual = stats.gmean(a) - desired = power(1*2*3*4,1./4.) - assert_almost_equal(actual, desired,decimal=14) - - desired1 = stats.gmean(a,axis=-1) - assert_almost_equal(actual, desired1, decimal=14) - - def test_1D_array(self): - a = array((1,2,3,4), float32) - actual = stats.gmean(a) - desired = power(1*2*3*4,1./4.) - assert_almost_equal(actual, desired, decimal=7) - - desired1 = stats.gmean(a,axis=-1) - assert_almost_equal(actual, desired1, decimal=7) - - def test_2D_array_default(self): - a = array(((1,2,3,4), - (1,2,3,4), - (1,2,3,4))) - actual = stats.gmean(a) - desired = array((1,2,3,4)) - assert_array_almost_equal(actual, desired, decimal=14) - - desired1 = stats.gmean(a,axis=0) - assert_array_almost_equal(actual, desired1, decimal=14) - - def test_2D_array_dim1(self): - a = array(((1,2,3,4), - (1,2,3,4), - (1,2,3,4))) - actual = stats.gmean(a, axis=1) - v = power(1*2*3*4,1./4.) 
- desired = array((v,v,v)) - assert_array_almost_equal(actual, desired, decimal=14) - - def test_large_values(self): - a = array([1e100, 1e200, 1e300]) - actual = stats.gmean(a) - assert_approx_equal(actual, 1e200, significant=14) - - -class TestHMean(TestCase): - def test_1D_list(self): - a = (1,2,3,4) - actual = stats.hmean(a) - desired = 4. / (1./1 + 1./2 + 1./3 + 1./4) - assert_almost_equal(actual, desired, decimal=14) - - desired1 = stats.hmean(array(a),axis=-1) - assert_almost_equal(actual, desired1, decimal=14) - - def test_1D_array(self): - a = array((1,2,3,4), float64) - actual = stats.hmean(a) - desired = 4. / (1./1 + 1./2 + 1./3 + 1./4) - assert_almost_equal(actual, desired, decimal=14) - - desired1 = stats.hmean(a,axis=-1) - assert_almost_equal(actual, desired1, decimal=14) - - def test_2D_array_default(self): - a = array(((1,2,3,4), - (1,2,3,4), - (1,2,3,4))) - actual = stats.hmean(a) - desired = array((1.,2.,3.,4.)) - assert_array_almost_equal(actual, desired, decimal=14) - - actual1 = stats.hmean(a,axis=0) - assert_array_almost_equal(actual1, desired, decimal=14) - - def test_2D_array_dim1(self): - a = array(((1,2,3,4), - (1,2,3,4), - (1,2,3,4))) - - v = 4. / (1./1 + 1./2 + 1./3 + 1./4) - desired1 = array((v,v,v)) - actual1 = stats.hmean(a, axis=1) - assert_array_almost_equal(actual1, desired1, decimal=14) - - -class TestScoreatpercentile(TestCase): - def setUp(self): - self.a1 = [3, 4, 5, 10, -3, -5, 6] - self.a2 = [3, -6, -2, 8, 7, 4, 2, 1] - self.a3 = [3., 4, 5, 10, -3, -5, -6, 7.0] - - def test_basic(self): - x = arange(8) * 0.5 - assert_equal(stats.scoreatpercentile(x, 0), 0.) - assert_equal(stats.scoreatpercentile(x, 100), 3.5) - assert_equal(stats.scoreatpercentile(x, 50), 1.75) - - def test_2D(self): - x = array([[1, 1, 1], - [1, 1, 1], - [4, 4, 3], - [1, 1, 1], - [1, 1, 1]]) - assert_array_equal(stats.scoreatpercentile(x, 50), [1, 1, 1]) - - def test_fraction(self): - scoreatperc = stats.scoreatpercentile - - # Test defaults - assert_equal(scoreatperc(list(range(10)), 50), 4.5) - assert_equal(scoreatperc(list(range(10)), 50, (2,7)), 4.5) - assert_equal(scoreatperc(list(range(100)), 50, limit=(1, 8)), 4.5) - assert_equal(scoreatperc(np.array([1, 10,100]), 50, (10,100)), 55) - assert_equal(scoreatperc(np.array([1, 10,100]), 50, (1,10)), 5.5) - - # explicitly specify interpolation_method 'fraction' (the default) - assert_equal(scoreatperc(list(range(10)), 50, interpolation_method='fraction'), - 4.5) - assert_equal(scoreatperc(list(range(10)), 50, limit=(2, 7), - interpolation_method='fraction'), - 4.5) - assert_equal(scoreatperc(list(range(100)), 50, limit=(1, 8), - interpolation_method='fraction'), - 4.5) - assert_equal(scoreatperc(np.array([1, 10,100]), 50, (10, 100), - interpolation_method='fraction'), - 55) - assert_equal(scoreatperc(np.array([1, 10,100]), 50, (1,10), - interpolation_method='fraction'), - 5.5) - - def test_lower_higher(self): - scoreatperc = stats.scoreatpercentile - - # interpolation_method 'lower'/'higher' - assert_equal(scoreatperc(list(range(10)), 50, - interpolation_method='lower'), 4) - assert_equal(scoreatperc(list(range(10)), 50, - interpolation_method='higher'), 5) - assert_equal(scoreatperc(list(range(10)), 50, (2,7), - interpolation_method='lower'), 4) - assert_equal(scoreatperc(list(range(10)), 50, limit=(2,7), - interpolation_method='higher'), 5) - assert_equal(scoreatperc(list(range(100)), 50, (1,8), - interpolation_method='lower'), 4) - assert_equal(scoreatperc(list(range(100)), 50, (1,8), - interpolation_method='higher'), 5) - 
assert_equal(scoreatperc(np.array([1, 10, 100]), 50, (10, 100), - interpolation_method='lower'), 10) - assert_equal(scoreatperc(np.array([1, 10, 100]), 50, limit=(10, 100), - interpolation_method='higher'), 100) - assert_equal(scoreatperc(np.array([1, 10, 100]), 50, (1, 10), - interpolation_method='lower'), 1) - assert_equal(scoreatperc(np.array([1, 10, 100]), 50, limit=(1, 10), - interpolation_method='higher'), 10) - - def test_sequence_per(self): - x = arange(8) * 0.5 - expected = np.array([0, 3.5, 1.75]) - res = stats.scoreatpercentile(x, [0, 100, 50]) - assert_allclose(res, expected) - assert_(isinstance(res, np.ndarray)) - # Test with ndarray. Regression test for gh-2861 - assert_allclose(stats.scoreatpercentile(x, np.array([0, 100, 50])), - expected) - # Also test combination of 2-D array, axis not None and array-like per - res2 = stats.scoreatpercentile(np.arange(12).reshape((3,4)), - np.array([0, 1, 100, 100]), axis=1) - expected2 = array([[0, 4, 8], - [0.03, 4.03, 8.03], - [3, 7, 11], - [3, 7, 11]]) - assert_allclose(res2, expected2) - - def test_axis(self): - scoreatperc = stats.scoreatpercentile - x = arange(12).reshape(3, 4) - - assert_equal(scoreatperc(x, (25, 50, 100)), [2.75, 5.5, 11.0]) - - r0 = [[2, 3, 4, 5], [4, 5, 6, 7], [8, 9, 10, 11]] - assert_equal(scoreatperc(x, (25, 50, 100), axis=0), r0) - - r1 = [[0.75, 4.75, 8.75], [1.5, 5.5, 9.5], [3, 7, 11]] - assert_equal(scoreatperc(x, (25, 50, 100), axis=1), r1) - - def test_exception(self): - assert_raises(ValueError, stats.scoreatpercentile, [1, 2], 56, - interpolation_method='foobar') - assert_raises(ValueError, stats.scoreatpercentile, [1], 101) - assert_raises(ValueError, stats.scoreatpercentile, [1], -1) - - def test_empty(self): - assert_equal(stats.scoreatpercentile([], 50), np.nan) - assert_equal(stats.scoreatpercentile(np.array([[], []]), 50), np.nan) - assert_equal(stats.scoreatpercentile([], [50, 99]), [np.nan, np.nan]) - - -class TestItemfreq(object): - a = [5, 7, 1, 2, 1, 5, 7] * 10 - b = [1, 2, 5, 7] - - def test_numeric_types(self): - # Check itemfreq works for all dtypes (adapted from np.unique tests) - def _check_itemfreq(dt): - a = np.array(self.a, dt) - v = stats.itemfreq(a) - assert_array_equal(v[:, 0], [1, 2, 5, 7]) - assert_array_equal(v[:, 1], np.array([20, 10, 20, 20], dtype=dt)) - - dtypes = [np.int32, np.int64, np.float32, np.float64, - np.complex64, np.complex128] - for dt in dtypes: - yield _check_itemfreq, dt - - def test_object_arrays(self): - a, b = self.a, self.b - dt = 'O' - aa = np.empty(len(a), dt) - aa[:] = a - bb = np.empty(len(b), dt) - bb[:] = b - v = stats.itemfreq(aa) - assert_array_equal(v[:, 0], bb) - - def test_structured_arrays(self): - a, b = self.a, self.b - dt = [('', 'i'), ('', 'i')] - aa = np.array(list(zip(a, a)), dt) - bb = np.array(list(zip(b, b)), dt) - v = stats.itemfreq(aa) - # Arrays don't compare equal because v[:,0] is object array - assert_equal(tuple(v[2, 0]), tuple(bb[2])) - - -class TestMode(TestCase): - def test_basic(self): - data1 = [3,5,1,10,23,3,2,6,8,6,10,6] - vals = stats.mode(data1) - assert_almost_equal(vals[0][0],6) - assert_almost_equal(vals[1][0],3) - - def test_axes(self): - data1 = [10,10,30,40] - data2 = [10,10,10,10] - data3 = [20,10,20,20] - data4 = [30,30,30,30] - data5 = [40,30,30,30] - arr = np.array([data1, data2, data3, data4, data5]) - - vals = stats.mode(arr, axis=None) - assert_almost_equal(vals[0],np.array([30])) - assert_almost_equal(vals[1],np.array([8])) - - vals = stats.mode(arr, axis=0) - 
assert_almost_equal(vals[0],np.array([[10,10,30,30]])) - assert_almost_equal(vals[1],np.array([[2,3,3,2]])) - - vals = stats.mode(arr, axis=1) - assert_almost_equal(vals[0],np.array([[10],[10],[20],[30],[30]])) - assert_almost_equal(vals[1],np.array([[2],[4],[3],[4],[3]])) - - def test_strings(self): - data1 = ['rain', 'showers', 'showers'] - vals = stats.mode(data1) - expected = ['showers'] - assert_equal(vals[0][0], 'showers') - assert_equal(vals[1][0], 2) - - @dec.knownfailureif(sys.version_info > (3,), 'numpy github issue 641') - def test_mixed_objects(self): - objects = [10, True, np.nan, 'hello', 10] - arr = np.empty((5,), dtype=object) - arr[:] = objects - vals = stats.mode(arr) - assert_equal(vals[0][0], 10) - assert_equal(vals[1][0], 2) - - def test_objects(self): - """Python objects must be sortable (le + eq) and have ne defined - for np.unique to work. hash is for set. - """ - class Point(object): - def __init__(self, x): - self.x = x - - def __eq__(self, other): - return self.x == other.x - - def __ne__(self, other): - return self.x != other.x - - def __lt__(self, other): - return self.x < other.x - - def __hash__(self): - return hash(self.x) - - points = [Point(x) for x in [1,2,3,4,3,2,2,2]] - arr = np.empty((8,), dtype=object) - arr[:] = points - assert len(set(points)) == 4 - assert_equal(np.unique(arr).shape, (4,)) - vals = stats.mode(arr) - assert_equal(vals[0][0], Point(2)) - assert_equal(vals[1][0], 4) - - -class TestVariability(TestCase): - - testcase = [1,2,3,4] - - def test_signaltonoise(self): - # This is not in R, so used: - # mean(testcase, axis=0) / (sqrt(var(testcase) * 3/4)) - - # y = stats.signaltonoise(self.shoes[0]) - # assert_approx_equal(y,4.5709967) - y = stats.signaltonoise(self.testcase) - assert_approx_equal(y,2.236067977) - - def test_sem(self): - # This is not in R, so used: - # sqrt(var(testcase)*3/4)/sqrt(3) - - # y = stats.sem(self.shoes[0]) - # assert_approx_equal(y,0.775177399) - y = stats.sem(self.testcase) - assert_approx_equal(y, 0.6454972244) - n = len(self.testcase) - assert_allclose(stats.sem(self.testcase, ddof=0) * np.sqrt(n/(n-2)), - stats.sem(self.testcase, ddof=2)) - - def test_zmap(self): - # not in R, so tested by using: - # (testcase[i] - mean(testcase, axis=0)) / sqrt(var(testcase) * 3/4) - y = stats.zmap(self.testcase,self.testcase) - desired = ([-1.3416407864999, -0.44721359549996, 0.44721359549996, 1.3416407864999]) - assert_array_almost_equal(desired,y,decimal=12) - - def test_zmap_axis(self): - # Test use of 'axis' keyword in zmap. - x = np.array([[0.0, 0.0, 1.0, 1.0], - [1.0, 1.0, 1.0, 2.0], - [2.0, 0.0, 2.0, 0.0]]) - - t1 = 1.0/np.sqrt(2.0/3) - t2 = np.sqrt(3.)/3 - t3 = np.sqrt(2.) - - z0 = stats.zmap(x, x, axis=0) - z1 = stats.zmap(x, x, axis=1) - - z0_expected = [[-t1, -t3/2, -t3/2, 0.0], - [0.0, t3, -t3/2, t1], - [t1, -t3/2, t3, -t1]] - z1_expected = [[-1.0, -1.0, 1.0, 1.0], - [-t2, -t2, -t2, np.sqrt(3.)], - [1.0, -1.0, 1.0, -1.0]] - - assert_array_almost_equal(z0, z0_expected) - assert_array_almost_equal(z1, z1_expected) - - def test_zmap_ddof(self): - # Test use of 'ddof' keyword in zmap. 
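For orientation before the ddof cases: zmap standardizes scores against the mean and standard deviation of a separate comparison array, so zmap(x, x) coincides with zscore(x). A minimal illustrative sketch (the arrays here are examples, not fixtures from this class):

import numpy as np
from scipy import stats

scores = np.array([1.0, 2.0, 3.0, 4.0])
compare = np.array([1.0, 2.0, 3.0, 4.0])
# zmap(scores, compare, ddof=d) == (scores - mean(compare)) / std(compare, ddof=d)
expected = (scores - compare.mean()) / compare.std(ddof=0)
assert np.allclose(stats.zmap(scores, compare), expected)
assert np.allclose(stats.zscore(scores), expected)   # same thing when compare is scores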
- x = np.array([[0.0, 0.0, 1.0, 1.0], - [0.0, 1.0, 2.0, 3.0]]) - - z = stats.zmap(x, x, axis=1, ddof=1) - - z0_expected = np.array([-0.5, -0.5, 0.5, 0.5])/(1.0/np.sqrt(3)) - z1_expected = np.array([-1.5, -0.5, 0.5, 1.5])/(np.sqrt(5./3)) - assert_array_almost_equal(z[0], z0_expected) - assert_array_almost_equal(z[1], z1_expected) - - def test_zscore(self): - # not in R, so tested by using: - # (testcase[i] - mean(testcase, axis=0)) / sqrt(var(testcase) * 3/4) - y = stats.zscore(self.testcase) - desired = ([-1.3416407864999, -0.44721359549996, 0.44721359549996, 1.3416407864999]) - assert_array_almost_equal(desired,y,decimal=12) - - def test_zscore_axis(self): - # Test use of 'axis' keyword in zscore. - x = np.array([[0.0, 0.0, 1.0, 1.0], - [1.0, 1.0, 1.0, 2.0], - [2.0, 0.0, 2.0, 0.0]]) - - t1 = 1.0/np.sqrt(2.0/3) - t2 = np.sqrt(3.)/3 - t3 = np.sqrt(2.) - - z0 = stats.zscore(x, axis=0) - z1 = stats.zscore(x, axis=1) - - z0_expected = [[-t1, -t3/2, -t3/2, 0.0], - [0.0, t3, -t3/2, t1], - [t1, -t3/2, t3, -t1]] - z1_expected = [[-1.0, -1.0, 1.0, 1.0], - [-t2, -t2, -t2, np.sqrt(3.)], - [1.0, -1.0, 1.0, -1.0]] - - assert_array_almost_equal(z0, z0_expected) - assert_array_almost_equal(z1, z1_expected) - - def test_zscore_ddof(self): - # Test use of 'ddof' keyword in zscore. - x = np.array([[0.0, 0.0, 1.0, 1.0], - [0.0, 1.0, 2.0, 3.0]]) - - z = stats.zscore(x, axis=1, ddof=1) - - z0_expected = np.array([-0.5, -0.5, 0.5, 0.5])/(1.0/np.sqrt(3)) - z1_expected = np.array([-1.5, -0.5, 0.5, 1.5])/(np.sqrt(5./3)) - assert_array_almost_equal(z[0], z0_expected) - assert_array_almost_equal(z[1], z1_expected) - - -class TestMoments(TestCase): - """ - Comparison numbers are found using R v.1.5.1 - note that length(testcase) = 4 - testmathworks comes from documentation for the - Statistics Toolbox for Matlab and can be found at both - http://www.mathworks.com/access/helpdesk/help/toolbox/stats/kurtosis.shtml - http://www.mathworks.com/access/helpdesk/help/toolbox/stats/skewness.shtml - Note that both test cases came from here. 
- """ - testcase = [1,2,3,4] - testmathworks = [1.165, 0.6268, 0.0751, 0.3516, -0.6965] - - def test_moment(self): - # mean((testcase-mean(testcase))**power,axis=0),axis=0))**power)) - y = stats.moment(self.testcase,1) - assert_approx_equal(y,0.0,10) - y = stats.moment(self.testcase,2) - assert_approx_equal(y,1.25) - y = stats.moment(self.testcase,3) - assert_approx_equal(y,0.0) - y = stats.moment(self.testcase,4) - assert_approx_equal(y,2.5625) - - def test_variation(self): - # variation = samplestd / mean - y = stats.variation(self.testcase) - assert_approx_equal(y,0.44721359549996, 10) - - def test_skewness(self): - # sum((testmathworks-mean(testmathworks,axis=0))**3,axis=0) / - # ((sqrt(var(testmathworks)*4/5))**3)/5 - y = stats.skew(self.testmathworks) - assert_approx_equal(y,-0.29322304336607,10) - y = stats.skew(self.testmathworks,bias=0) - assert_approx_equal(y,-0.437111105023940,10) - y = stats.skew(self.testcase) - assert_approx_equal(y,0.0,10) - - def test_skewness_scalar(self): - # `skew` must return a scalar for 1-dim input - assert_equal(stats.skew(arange(10)), 0.0) - - def test_kurtosis(self): - # sum((testcase-mean(testcase,axis=0))**4,axis=0)/((sqrt(var(testcase)*3/4))**4)/4 - # sum((test2-mean(testmathworks,axis=0))**4,axis=0)/((sqrt(var(testmathworks)*4/5))**4)/5 - # Set flags for axis = 0 and - # fisher=0 (Pearson's defn of kurtosis for compatiability with Matlab) - y = stats.kurtosis(self.testmathworks,0,fisher=0,bias=1) - assert_approx_equal(y, 2.1658856802973,10) - - # Note that MATLAB has confusing docs for the following case - # kurtosis(x,0) gives an unbiased estimate of Pearson's skewness - # kurtosis(x) gives a biased estimate of Fisher's skewness (Pearson-3) - # The MATLAB docs imply that both should give Fisher's - y = stats.kurtosis(self.testmathworks,fisher=0,bias=0) - assert_approx_equal(y, 3.663542721189047,10) - y = stats.kurtosis(self.testcase,0,0) - assert_approx_equal(y,1.64) - - def test_kurtosis_array_scalar(self): - assert_equal(type(stats.kurtosis([1,2,3])), float) - - -class TestThreshold(TestCase): - def test_basic(self): - a = [-1,2,3,4,5,-1,-2] - assert_array_equal(stats.threshold(a),a) - assert_array_equal(stats.threshold(a,3,None,0), - [0,0,3,4,5,0,0]) - assert_array_equal(stats.threshold(a,None,3,0), - [-1,2,3,0,0,-1,-2]) - assert_array_equal(stats.threshold(a,2,4,0), - [0,2,3,4,0,0,0]) - - -class TestStudentTest(TestCase): - X1 = np.array([-1, 0, 1]) - X2 = np.array([0, 1, 2]) - T1_0 = 0 - P1_0 = 1 - T1_1 = -1.732051 - P1_1 = 0.2254033 - T1_2 = -3.464102 - P1_2 = 0.0741799 - T2_0 = 1.732051 - P2_0 = 0.2254033 - - def test_onesample(self): - t, p = stats.ttest_1samp(self.X1, 0) - - assert_array_almost_equal(t, self.T1_0) - assert_array_almost_equal(p, self.P1_0) - - t, p = stats.ttest_1samp(self.X2, 0) - - assert_array_almost_equal(t, self.T2_0) - assert_array_almost_equal(p, self.P2_0) - - t, p = stats.ttest_1samp(self.X1, 1) - - assert_array_almost_equal(t, self.T1_1) - assert_array_almost_equal(p, self.P1_1) - - t, p = stats.ttest_1samp(self.X1, 2) - - assert_array_almost_equal(t, self.T1_2) - assert_array_almost_equal(p, self.P1_2) - - -def test_percentileofscore(): - pcos = stats.percentileofscore - - assert_equal(pcos([1,2,3,4,5,6,7,8,9,10],4), 40.0) - - for (kind, result) in [('mean', 35.0), - ('strict', 30.0), - ('weak', 40.0)]: - yield assert_equal, pcos(np.arange(10) + 1, - 4, kind=kind), \ - result - - # multiple - 2 - for (kind, result) in [('rank', 45.0), - ('strict', 30.0), - ('weak', 50.0), - ('mean', 40.0)]: - yield 
assert_equal, pcos([1,2,3,4,4,5,6,7,8,9], - 4, kind=kind), \ - result - - # multiple - 3 - assert_equal(pcos([1,2,3,4,4,4,5,6,7,8], 4), 50.0) - for (kind, result) in [('rank', 50.0), - ('mean', 45.0), - ('strict', 30.0), - ('weak', 60.0)]: - - yield assert_equal, pcos([1,2,3,4,4,4,5,6,7,8], - 4, kind=kind), \ - result - - # missing - for kind in ('rank', 'mean', 'strict', 'weak'): - yield assert_equal, pcos([1,2,3,5,6,7,8,9,10,11], - 4, kind=kind), \ - 30 - - # larger numbers - for (kind, result) in [('mean', 35.0), - ('strict', 30.0), - ('weak', 40.0)]: - yield assert_equal, \ - pcos([10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 40, - kind=kind), result - - for (kind, result) in [('mean', 45.0), - ('strict', 30.0), - ('weak', 60.0)]: - yield assert_equal, \ - pcos([10, 20, 30, 40, 40, 40, 50, 60, 70, 80], - 40, kind=kind), result - - for kind in ('rank', 'mean', 'strict', 'weak'): - yield assert_equal, \ - pcos([10, 20, 30, 50, 60, 70, 80, 90, 100, 110], - 40, kind=kind), 30.0 - - # boundaries - for (kind, result) in [('rank', 10.0), - ('mean', 5.0), - ('strict', 0.0), - ('weak', 10.0)]: - yield assert_equal, \ - pcos([10, 20, 30, 50, 60, 70, 80, 90, 100, 110], - 10, kind=kind), result - - for (kind, result) in [('rank', 100.0), - ('mean', 95.0), - ('strict', 90.0), - ('weak', 100.0)]: - yield assert_equal, \ - pcos([10, 20, 30, 50, 60, 70, 80, 90, 100, 110], - 110, kind=kind), result - - # out of bounds - for (kind, score, result) in [('rank', 200, 100.0), - ('mean', 200, 100.0), - ('mean', 0, 0.0)]: - yield assert_equal, \ - pcos([10, 20, 30, 50, 60, 70, 80, 90, 100, 110], - score, kind=kind), result - - assert_raises(ValueError, pcos, [1, 2, 3, 3, 4], 3, kind='unrecognized') - - -PowerDivCase = namedtuple('Case', ['f_obs', 'f_exp', 'ddof', 'axis', - 'chi2', # Pearson's - 'log', # G-test (log-likelihood) - 'mod_log', # Modified log-likelihood - 'cr', # Cressie-Read (lambda=2/3) - ]) - -# The details of the first two elements in power_div_1d_cases are used -# in a test in TestPowerDivergence. Check that code before making -# any changes here. -power_div_1d_cases = [ - # Use the default f_exp. - PowerDivCase(f_obs=[4, 8, 12, 8], f_exp=None, ddof=0, axis=None, - chi2=4, - log=2*(4*np.log(4/8) + 12*np.log(12/8)), - mod_log=2*(8*np.log(8/4) + 8*np.log(8/12)), - cr=(4*((4/8)**(2/3) - 1) + 12*((12/8)**(2/3) - 1))/(5/9)), - # Give a non-uniform f_exp. - PowerDivCase(f_obs=[4, 8, 12, 8], f_exp=[2, 16, 12, 2], ddof=0, axis=None, - chi2=24, - log=2*(4*np.log(4/2) + 8*np.log(8/16) + 8*np.log(8/2)), - mod_log=2*(2*np.log(2/4) + 16*np.log(16/8) + 2*np.log(2/8)), - cr=(4*((4/2)**(2/3) - 1) + 8*((8/16)**(2/3) - 1) + - 8*((8/2)**(2/3) - 1))/(5/9)), - # f_exp is a scalar. - PowerDivCase(f_obs=[4, 8, 12, 8], f_exp=8, ddof=0, axis=None, - chi2=4, - log=2*(4*np.log(4/8) + 12*np.log(12/8)), - mod_log=2*(8*np.log(8/4) + 8*np.log(8/12)), - cr=(4*((4/8)**(2/3) - 1) + 12*((12/8)**(2/3) - 1))/(5/9)), - # f_exp equal to f_obs. - PowerDivCase(f_obs=[3, 5, 7, 9], f_exp=[3, 5, 7, 9], ddof=0, axis=0, - chi2=0, log=0, mod_log=0, cr=0), -] - - -power_div_empty_cases = [ - # Shape is (0,)--a data set with length 0. The computed - # test statistic should be 0. - PowerDivCase(f_obs=[], - f_exp=None, ddof=0, axis=0, - chi2=0, log=0, mod_log=0, cr=0), - # Shape is (0, 3). This is 3 data sets, but each data set has - # length 0, so the computed test statistic should be [0, 0, 0]. 
- PowerDivCase(f_obs=np.array([[],[],[]]).T, - f_exp=None, ddof=0, axis=0, - chi2=[0, 0, 0], - log=[0, 0, 0], - mod_log=[0, 0, 0], - cr=[0, 0, 0]), - # Shape is (3, 0). This represents an empty collection of - # data sets in which each data set has length 3. The test - # statistic should be an empty array. - PowerDivCase(f_obs=np.array([[],[],[]]), - f_exp=None, ddof=0, axis=0, - chi2=[], - log=[], - mod_log=[], - cr=[]), -] - - -class TestPowerDivergence(object): - - def check_power_divergence(self, f_obs, f_exp, ddof, axis, lambda_, - expected_stat): - f_obs = np.asarray(f_obs) - if axis is None: - num_obs = f_obs.size - else: - b = np.broadcast(f_obs, f_exp) - num_obs = b.shape[axis] - stat, p = stats.power_divergence(f_obs=f_obs, f_exp=f_exp, ddof=ddof, - axis=axis, lambda_=lambda_) - assert_allclose(stat, expected_stat) - - if lambda_ == 1 or lambda_ == "pearson": - # Also test stats.chisquare. - stat, p = stats.chisquare(f_obs=f_obs, f_exp=f_exp, ddof=ddof, - axis=axis) - assert_allclose(stat, expected_stat) - - ddof = np.asarray(ddof) - expected_p = stats.chisqprob(expected_stat, num_obs - 1 - ddof) - assert_allclose(p, expected_p) - - def test_basic(self): - for case in power_div_1d_cases: - yield (self.check_power_divergence, - case.f_obs, case.f_exp, case.ddof, case.axis, - None, case.chi2) - yield (self.check_power_divergence, - case.f_obs, case.f_exp, case.ddof, case.axis, - "pearson", case.chi2) - yield (self.check_power_divergence, - case.f_obs, case.f_exp, case.ddof, case.axis, - 1, case.chi2) - yield (self.check_power_divergence, - case.f_obs, case.f_exp, case.ddof, case.axis, - "log-likelihood", case.log) - yield (self.check_power_divergence, - case.f_obs, case.f_exp, case.ddof, case.axis, - "mod-log-likelihood", case.mod_log) - yield (self.check_power_divergence, - case.f_obs, case.f_exp, case.ddof, case.axis, - "cressie-read", case.cr) - yield (self.check_power_divergence, - case.f_obs, case.f_exp, case.ddof, case.axis, - 2/3, case.cr) - - def test_basic_masked(self): - for case in power_div_1d_cases: - mobs = np.ma.array(case.f_obs) - yield (self.check_power_divergence, - mobs, case.f_exp, case.ddof, case.axis, - None, case.chi2) - yield (self.check_power_divergence, - mobs, case.f_exp, case.ddof, case.axis, - "pearson", case.chi2) - yield (self.check_power_divergence, - mobs, case.f_exp, case.ddof, case.axis, - 1, case.chi2) - yield (self.check_power_divergence, - mobs, case.f_exp, case.ddof, case.axis, - "log-likelihood", case.log) - yield (self.check_power_divergence, - mobs, case.f_exp, case.ddof, case.axis, - "mod-log-likelihood", case.mod_log) - yield (self.check_power_divergence, - mobs, case.f_exp, case.ddof, case.axis, - "cressie-read", case.cr) - yield (self.check_power_divergence, - mobs, case.f_exp, case.ddof, case.axis, - 2/3, case.cr) - - def test_axis(self): - case0 = power_div_1d_cases[0] - case1 = power_div_1d_cases[1] - f_obs = np.vstack((case0.f_obs, case1.f_obs)) - f_exp = np.vstack((np.ones_like(case0.f_obs)*np.mean(case0.f_obs), - case1.f_exp)) - # Check the four computational code paths in power_divergence - # using a 2D array with axis=1. 
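As a concrete anchor for these cases: with lambda_=1 the Cressie-Read statistic reduces to Pearson's chi-square, so the first entry of power_div_1d_cases (chi2=4) can be reproduced by hand. An illustrative sketch, assuming only numpy and scipy.stats:

import numpy as np
from scipy import stats

# Pearson's chi-square is sum((obs - exp)**2 / exp). With the default
# f_exp (uniform at the observed mean), [4, 8, 12, 8] gives
# (16 + 0 + 16 + 0) / 8 == 4.
f_obs = np.array([4.0, 8.0, 12.0, 8.0])
f_exp = np.full_like(f_obs, f_obs.mean())
by_hand = ((f_obs - f_exp) ** 2 / f_exp).sum()
stat, p = stats.power_divergence(f_obs, lambda_=1)
assert np.isclose(stat, by_hand) and np.isclose(stat, 4.0)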
- yield (self.check_power_divergence, - f_obs, f_exp, 0, 1, - "pearson", [case0.chi2, case1.chi2]) - yield (self.check_power_divergence, - f_obs, f_exp, 0, 1, - "log-likelihood", [case0.log, case1.log]) - yield (self.check_power_divergence, - f_obs, f_exp, 0, 1, - "mod-log-likelihood", [case0.mod_log, case1.mod_log]) - yield (self.check_power_divergence, - f_obs, f_exp, 0, 1, - "cressie-read", [case0.cr, case1.cr]) - # Reshape case0.f_obs to shape (2,2), and use axis=None. - # The result should be the same. - yield (self.check_power_divergence, - np.array(case0.f_obs).reshape(2, 2), None, 0, None, - "pearson", case0.chi2) - - def test_ddof_broadcasting(self): - # Test that ddof broadcasts correctly. - # ddof does not affect the test statistic. It is broadcast - # with the computed test statistic for the computation of - # the p value. - - case0 = power_div_1d_cases[0] - case1 = power_div_1d_cases[1] - # Create 4x2 arrays of observed and expected frequencies. - f_obs = np.vstack((case0.f_obs, case1.f_obs)).T - f_exp = np.vstack((np.ones_like(case0.f_obs)*np.mean(case0.f_obs), - case1.f_exp)).T - - expected_chi2 = [case0.chi2, case1.chi2] - - # ddof has shape (2, 1). This is broadcast with the computed - # statistic, so p will have shape (2,2). - ddof = np.array([[0], [1]]) - - stat, p = stats.power_divergence(f_obs, f_exp, ddof=ddof) - assert_allclose(stat, expected_chi2) - - # Compute the p values separately, passing in scalars for ddof. - stat0, p0 = stats.power_divergence(f_obs, f_exp, ddof=ddof[0,0]) - stat1, p1 = stats.power_divergence(f_obs, f_exp, ddof=ddof[1,0]) - - assert_array_equal(p, np.vstack((p0, p1))) - - def test_empty_cases(self): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - for case in power_div_empty_cases: - yield (self.check_power_divergence, - case.f_obs, case.f_exp, case.ddof, case.axis, - "pearson", case.chi2) - yield (self.check_power_divergence, - case.f_obs, case.f_exp, case.ddof, case.axis, - "log-likelihood", case.log) - yield (self.check_power_divergence, - case.f_obs, case.f_exp, case.ddof, case.axis, - "mod-log-likelihood", case.mod_log) - yield (self.check_power_divergence, - case.f_obs, case.f_exp, case.ddof, case.axis, - "cressie-read", case.cr) - - -def test_chisquare_masked_arrays(): - # Test masked arrays. 
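One more note on the ddof handling exercised above: ddof never changes the statistic, only the degrees of freedom of the reference chi-square distribution used for the p-value. A small sketch (stats.chi2.sf(x, df) is the same tail probability as the chisqprob(x, df) these tests call):

import numpy as np
from scipy import stats

f_obs = np.array([4.0, 8.0, 12.0, 8.0])
stat0, p0 = stats.chisquare(f_obs, ddof=0)
stat1, p1 = stats.chisquare(f_obs, ddof=1)
assert stat0 == stat1                          # statistic is unchanged
k = len(f_obs)                                 # k cells -> df = k - 1 - ddof
assert np.isclose(p0, stats.chi2.sf(stat0, k - 1))
assert np.isclose(p1, stats.chi2.sf(stat1, k - 1 - 1))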
- obs = np.array([[8, 8, 16, 32, -1], [-1, -1, 3, 4, 5]]).T - mask = np.array([[0, 0, 0, 0, 1], [1, 1, 0, 0, 0]]).T - mobs = np.ma.masked_array(obs, mask) - expected_chisq = np.array([24.0, 0.5]) - expected_g = np.array([2*(2*8*np.log(0.5) + 32*np.log(2.0)), - 2*(3*np.log(0.75) + 5*np.log(1.25))]) - - chisq, p = stats.chisquare(mobs) - mat.assert_array_equal(chisq, expected_chisq) - mat.assert_array_almost_equal(p, stats.chisqprob(expected_chisq, - mobs.count(axis=0) - 1)) - - g, p = stats.power_divergence(mobs, lambda_='log-likelihood') - mat.assert_array_almost_equal(g, expected_g, decimal=15) - mat.assert_array_almost_equal(p, stats.chisqprob(expected_g, - mobs.count(axis=0) - 1)) - - chisq, p = stats.chisquare(mobs.T, axis=1) - mat.assert_array_equal(chisq, expected_chisq) - mat.assert_array_almost_equal(p, - stats.chisqprob(expected_chisq, - mobs.T.count(axis=1) - 1)) - - g, p = stats.power_divergence(mobs.T, axis=1, lambda_="log-likelihood") - mat.assert_array_almost_equal(g, expected_g, decimal=15) - mat.assert_array_almost_equal(p, stats.chisqprob(expected_g, - mobs.count(axis=0) - 1)) - - obs1 = np.ma.array([3, 5, 6, 99, 10], mask=[0, 0, 0, 1, 0]) - exp1 = np.ma.array([2, 4, 8, 10, 99], mask=[0, 0, 0, 0, 1]) - chi2, p = stats.chisquare(obs1, f_exp=exp1) - # Because of the mask at index 3 of obs1 and at index 4 of exp1, - # only the first three elements are included in the calculation - # of the statistic. - mat.assert_array_equal(chi2, 1/2 + 1/4 + 4/8) - - # When axis=None, the two values should have type np.float64. - chisq, p = stats.chisquare(np.ma.array([1,2,3]), axis=None) - assert_(isinstance(chisq, np.float64)) - assert_(isinstance(p, np.float64)) - assert_equal(chisq, 1.0) - assert_almost_equal(p, stats.chisqprob(1.0, 2)) - - # Empty arrays: - # A data set with length 0 returns a masked scalar. - with np.errstate(invalid='ignore'): - chisq, p = stats.chisquare(np.ma.array([])) - assert_(isinstance(chisq, np.ma.MaskedArray)) - assert_equal(chisq.shape, ()) - assert_(chisq.mask) - - empty3 = np.ma.array([[],[],[]]) - - # empty3 is a collection of 0 data sets (whose lengths would be 3, if - # there were any), so the return value is an array with length 0. - chisq, p = stats.chisquare(empty3) - assert_(isinstance(chisq, np.ma.MaskedArray)) - mat.assert_array_equal(chisq, []) - - # empty3.T is an array containing 3 data sets, each with length 0, - # so an array of size (3,) is returned, with all values masked. - with np.errstate(invalid='ignore'): - chisq, p = stats.chisquare(empty3.T) - assert_(isinstance(chisq, np.ma.MaskedArray)) - assert_equal(chisq.shape, (3,)) - assert_(np.all(chisq.mask)) - - -def test_power_divergence_against_cressie_read_data(): - # Test stats.power_divergence against tables 4 and 5 from - # Cressie and Read, "Multinomial Goodness-of-Fit Tests", - # J. R. Statist. Soc. B (1984), Vol 46, No. 3, pp. 440-464. - # This tests the calculation for several values of lambda. - - # `table4` holds just the second and third columns from Table 4.
- table4 = np.array([ - # observed, expected, - 15, 15.171, - 11, 13.952, - 14, 12.831, - 17, 11.800, - 5, 10.852, - 11, 9.9796, - 10, 9.1777, - 4, 8.4402, - 8, 7.7620, - 10, 7.1383, - 7, 6.5647, - 9, 6.0371, - 11, 5.5520, - 3, 5.1059, - 6, 4.6956, - 1, 4.3183, - 1, 3.9713, - 4, 3.6522, - ]).reshape(-1, 2) - table5 = np.array([ - # lambda, statistic - -10.0, 72.2e3, - -5.0, 28.9e1, - -3.0, 65.6, - -2.0, 40.6, - -1.5, 34.0, - -1.0, 29.5, - -0.5, 26.5, - 0.0, 24.6, - 0.5, 23.4, - 0.67, 23.1, - 1.0, 22.7, - 1.5, 22.6, - 2.0, 22.9, - 3.0, 24.8, - 5.0, 35.5, - 10.0, 21.4e1, - ]).reshape(-1, 2) - - for lambda_, expected_stat in table5: - stat, p = stats.power_divergence(table4[:,0], table4[:,1], - lambda_=lambda_) - assert_allclose(stat, expected_stat, rtol=5e-3) - - -def test_friedmanchisquare(): - # see ticket:113 - # verified with matlab and R - # From Demsar "Statistical Comparisons of Classifiers over Multiple Data Sets" - # 2006, Xf=9.28 (no tie handling, tie corrected Xf >=9.28) - x1 = [array([0.763, 0.599, 0.954, 0.628, 0.882, 0.936, 0.661, 0.583, - 0.775, 1.0, 0.94, 0.619, 0.972, 0.957]), - array([0.768, 0.591, 0.971, 0.661, 0.888, 0.931, 0.668, 0.583, - 0.838, 1.0, 0.962, 0.666, 0.981, 0.978]), - array([0.771, 0.590, 0.968, 0.654, 0.886, 0.916, 0.609, 0.563, - 0.866, 1.0, 0.965, 0.614, 0.9751, 0.946]), - array([0.798, 0.569, 0.967, 0.657, 0.898, 0.931, 0.685, 0.625, - 0.875, 1.0, 0.962, 0.669, 0.975, 0.970])] - - # From "Bioestadistica para las ciencias de la salud" Xf=18.95 p<0.001: - x2 = [array([4,3,5,3,5,3,2,5,4,4,4,3]), - array([2,2,1,2,3,1,2,3,2,1,1,3]), - array([2,4,3,3,4,3,3,4,4,1,2,1]), - array([3,5,4,3,4,4,3,3,3,4,4,4])] - - # From Jerrorl H. Zar, "Biostatistical Analysis"(example 12.6), Xf=10.68, 0.005 < p < 0.01: - # Probability from this example is inexact using Chisquare aproximation of Friedman Chisquare. 
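Since Zar's example is quoted with Xf = 10.68, here is where that number comes from: rank within each block, then apply the tie-free Friedman formula. An illustrative sketch using the same data that x3 holds below, arranged as blocks x treatments (np.apply_along_axis is used so no rankdata axis keyword is assumed):

import numpy as np
from scipy import stats

data = np.column_stack([[7.0, 9.9, 8.5, 5.1, 10.3],
                        [5.3, 5.7, 4.7, 3.5, 7.7],
                        [4.9, 7.6, 5.5, 2.8, 8.4],
                        [8.8, 8.9, 8.1, 3.3, 9.1]])   # 5 blocks x 4 treatments
n, k = data.shape
ranks = np.apply_along_axis(stats.rankdata, 1, data)  # rank within each block
xf = 12.0 * n / (k * (k + 1)) * ((ranks.mean(axis=0) - (k + 1) / 2.0) ** 2).sum()
assert np.isclose(xf, 10.68)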
- x3 = [array([7.0,9.9,8.5,5.1,10.3]), - array([5.3,5.7,4.7,3.5,7.7]), - array([4.9,7.6,5.5,2.8,8.4]), - array([8.8,8.9,8.1,3.3,9.1])] - - assert_array_almost_equal(stats.friedmanchisquare(x1[0],x1[1],x1[2],x1[3]), - (10.2283464566929, 0.0167215803284414)) - assert_array_almost_equal(stats.friedmanchisquare(x2[0],x2[1],x2[2],x2[3]), - (18.9428571428571, 0.000280938375189499)) - assert_array_almost_equal(stats.friedmanchisquare(x3[0],x3[1],x3[2],x3[3]), - (10.68, 0.0135882729582176)) - np.testing.assert_raises(ValueError, stats.friedmanchisquare,x3[0],x3[1]) - - # test using mstats - assert_array_almost_equal(stats.mstats.friedmanchisquare(x1[0],x1[1],x1[2],x1[3]), - (10.2283464566929, 0.0167215803284414)) - # the following fails - # assert_array_almost_equal(stats.mstats.friedmanchisquare(x2[0],x2[1],x2[2],x2[3]), - # (18.9428571428571, 0.000280938375189499)) - assert_array_almost_equal(stats.mstats.friedmanchisquare(x3[0],x3[1],x3[2],x3[3]), - (10.68, 0.0135882729582176)) - np.testing.assert_raises(ValueError,stats.mstats.friedmanchisquare,x3[0],x3[1]) - - -def test_kstest(): - # from numpy.testing import assert_almost_equal - - # comparing with values from R - x = np.linspace(-1,1,9) - D,p = stats.kstest(x,'norm') - assert_almost_equal(D, 0.15865525393145705, 12) - assert_almost_equal(p, 0.95164069201518386, 1) - - x = np.linspace(-15,15,9) - D,p = stats.kstest(x,'norm') - assert_almost_equal(D, 0.44435602715924361, 15) - assert_almost_equal(p, 0.038850140086788665, 8) - - # the following tests rely on deterministicaly replicated rvs - np.random.seed(987654321) - x = stats.norm.rvs(loc=0.2, size=100) - D,p = stats.kstest(x, 'norm', mode='asymp') - assert_almost_equal(D, 0.12464329735846891, 15) - assert_almost_equal(p, 0.089444888711820769, 15) - assert_almost_equal(np.array(stats.kstest(x, 'norm', mode='asymp')), - np.array((0.12464329735846891, 0.089444888711820769)), 15) - assert_almost_equal(np.array(stats.kstest(x,'norm', alternative='less')), - np.array((0.12464329735846891, 0.040989164077641749)), 15) - # this 'greater' test fails with precision of decimal=14 - assert_almost_equal(np.array(stats.kstest(x,'norm', alternative='greater')), - np.array((0.0072115233216310994, 0.98531158590396228)), 12) - - # missing: no test that uses *args - - -def test_ks_2samp(): - # exact small sample solution - data1 = np.array([1.0,2.0]) - data2 = np.array([1.0,2.0,3.0]) - assert_almost_equal(np.array(stats.ks_2samp(data1+0.01,data2)), - np.array((0.33333333333333337, 0.99062316386915694))) - assert_almost_equal(np.array(stats.ks_2samp(data1-0.01,data2)), - np.array((0.66666666666666674, 0.42490954988801982))) - # these can also be verified graphically - assert_almost_equal( - np.array(stats.ks_2samp(np.linspace(1,100,100), - np.linspace(1,100,100)+2+0.1)), - np.array((0.030000000000000027, 0.99999999996005062))) - assert_almost_equal( - np.array(stats.ks_2samp(np.linspace(1,100,100), - np.linspace(1,100,100)+2-0.1)), - np.array((0.020000000000000018, 0.99999999999999933))) - # these are just regression tests - assert_almost_equal( - np.array(stats.ks_2samp(np.linspace(1,100,100), - np.linspace(1,100,110)+20.1)), - np.array((0.21090909090909091, 0.015880386730710221))) - assert_almost_equal( - np.array(stats.ks_2samp(np.linspace(1,100,100), - np.linspace(1,100,110)+20-0.1)), - np.array((0.20818181818181825, 0.017981441789762638))) - - -def test_ttest_rel(): - # regression test - tr,pr = 0.81248591389165692, 0.41846234511362157 - tpr = ([tr,-tr],[pr,pr]) - - rvs1 = np.linspace(1,100,100) - rvs2 = 
np.linspace(1.01,99.989,100) - rvs1_2D = np.array([np.linspace(1,100,100), np.linspace(1.01,99.989,100)]) - rvs2_2D = np.array([np.linspace(1.01,99.989,100), np.linspace(1,100,100)]) - - t,p = stats.ttest_rel(rvs1, rvs2, axis=0) - assert_array_almost_equal([t,p],(tr,pr)) - t,p = stats.ttest_rel(rvs1_2D.T, rvs2_2D.T, axis=0) - assert_array_almost_equal([t,p],tpr) - t,p = stats.ttest_rel(rvs1_2D, rvs2_2D, axis=1) - assert_array_almost_equal([t,p],tpr) - - # test on 3 dimensions - rvs1_3D = np.dstack([rvs1_2D,rvs1_2D,rvs1_2D]) - rvs2_3D = np.dstack([rvs2_2D,rvs2_2D,rvs2_2D]) - t,p = stats.ttest_rel(rvs1_3D, rvs2_3D, axis=1) - assert_array_almost_equal(np.abs(t), tr) - assert_array_almost_equal(np.abs(p), pr) - assert_equal(t.shape, (2, 3)) - - t,p = stats.ttest_rel(np.rollaxis(rvs1_3D,2), np.rollaxis(rvs2_3D,2), axis=2) - assert_array_almost_equal(np.abs(t), tr) - assert_array_almost_equal(np.abs(p), pr) - assert_equal(t.shape, (3, 2)) - - olderr = np.seterr(all='ignore') - try: - # test zero division problem - t,p = stats.ttest_rel([0,0,0],[1,1,1]) - assert_equal((np.abs(t),p), (np.inf, 0)) - assert_equal(stats.ttest_rel([0,0,0], [0,0,0]), (np.nan, np.nan)) - - # check that nan in input array result in nan output - anan = np.array([[1,np.nan],[-1,1]]) - assert_equal(stats.ttest_ind(anan, np.zeros((2,2))),([0, np.nan], [1,np.nan])) - finally: - np.seterr(**olderr) - - # test incorrect input shape raise an error - x = np.arange(24) - assert_raises(ValueError, stats.ttest_rel, x.reshape((8, 3)), - x.reshape((2, 3, 4))) - - -def test_ttest_ind(): - # regression test - tr = 1.0912746897927283 - pr = 0.27647818616351882 - tpr = ([tr,-tr],[pr,pr]) - - rvs2 = np.linspace(1,100,100) - rvs1 = np.linspace(5,105,100) - rvs1_2D = np.array([rvs1, rvs2]) - rvs2_2D = np.array([rvs2, rvs1]) - - t,p = stats.ttest_ind(rvs1, rvs2, axis=0) - assert_array_almost_equal([t,p],(tr,pr)) - t,p = stats.ttest_ind(rvs1_2D.T, rvs2_2D.T, axis=0) - assert_array_almost_equal([t,p],tpr) - t,p = stats.ttest_ind(rvs1_2D, rvs2_2D, axis=1) - assert_array_almost_equal([t,p],tpr) - - # test on 3 dimensions - rvs1_3D = np.dstack([rvs1_2D,rvs1_2D,rvs1_2D]) - rvs2_3D = np.dstack([rvs2_2D,rvs2_2D,rvs2_2D]) - t,p = stats.ttest_ind(rvs1_3D, rvs2_3D, axis=1) - assert_almost_equal(np.abs(t), np.abs(tr)) - assert_array_almost_equal(np.abs(p), pr) - assert_equal(t.shape, (2, 3)) - - t,p = stats.ttest_ind(np.rollaxis(rvs1_3D,2), np.rollaxis(rvs2_3D,2), axis=2) - assert_array_almost_equal(np.abs(t), np.abs(tr)) - assert_array_almost_equal(np.abs(p), pr) - assert_equal(t.shape, (3, 2)) - - olderr = np.seterr(all='ignore') - try: - # test zero division problem - t,p = stats.ttest_ind([0,0,0],[1,1,1]) - assert_equal((np.abs(t),p), (np.inf, 0)) - assert_equal(stats.ttest_ind([0,0,0], [0,0,0]), (np.nan, np.nan)) - - # check that nan in input array result in nan output - anan = np.array([[1,np.nan],[-1,1]]) - assert_equal(stats.ttest_ind(anan, np.zeros((2,2))),([0, np.nan], [1,np.nan])) - finally: - np.seterr(**olderr) - - -def test_ttest_ind_with_uneq_var(): - # check vs. 
R - a = (1, 2, 3) - b = (1.1, 2.9, 4.2) - pr = 0.53619490753126731 - tr = -0.68649512735572582 - t, p = stats.ttest_ind(a, b, equal_var=False) - assert_array_almost_equal([t,p], [tr, pr]) - - a = (1, 2, 3, 4) - pr = 0.84354139131608286 - tr = -0.2108663315950719 - t, p = stats.ttest_ind(a, b, equal_var=False) - assert_array_almost_equal([t,p], [tr, pr]) - - # regression test - tr = 1.0912746897927283 - tr_uneq_n = 0.66745638708050492 - pr = 0.27647831993021388 - pr_uneq_n = 0.50873585065616544 - tpr = ([tr,-tr],[pr,pr]) - - rvs3 = np.linspace(1,100, 25) - rvs2 = np.linspace(1,100,100) - rvs1 = np.linspace(5,105,100) - rvs1_2D = np.array([rvs1, rvs2]) - rvs2_2D = np.array([rvs2, rvs1]) - - t,p = stats.ttest_ind(rvs1, rvs2, axis=0, equal_var=False) - assert_array_almost_equal([t,p],(tr,pr)) - t,p = stats.ttest_ind(rvs1, rvs3, axis=0, equal_var=False) - assert_array_almost_equal([t,p], (tr_uneq_n, pr_uneq_n)) - t,p = stats.ttest_ind(rvs1_2D.T, rvs2_2D.T, axis=0, equal_var=False) - assert_array_almost_equal([t,p],tpr) - t,p = stats.ttest_ind(rvs1_2D, rvs2_2D, axis=1, equal_var=False) - assert_array_almost_equal([t,p],tpr) - - # test on 3 dimensions - rvs1_3D = np.dstack([rvs1_2D,rvs1_2D,rvs1_2D]) - rvs2_3D = np.dstack([rvs2_2D,rvs2_2D,rvs2_2D]) - t,p = stats.ttest_ind(rvs1_3D, rvs2_3D, axis=1, equal_var=False) - assert_almost_equal(np.abs(t), np.abs(tr)) - assert_array_almost_equal(np.abs(p), pr) - assert_equal(t.shape, (2, 3)) - - t,p = stats.ttest_ind(np.rollaxis(rvs1_3D,2), np.rollaxis(rvs2_3D,2), - axis=2, equal_var=False) - assert_array_almost_equal(np.abs(t), np.abs(tr)) - assert_array_almost_equal(np.abs(p), pr) - assert_equal(t.shape, (3, 2)) - - olderr = np.seterr(all='ignore') - try: - # test zero division problem - t,p = stats.ttest_ind([0,0,0],[1,1,1], equal_var=False) - assert_equal((np.abs(t),p), (np.inf, 0)) - assert_equal(stats.ttest_ind([0,0,0], [0,0,0], equal_var=False), (np.nan, np.nan)) - - # check that nan in input array result in nan output - anan = np.array([[1,np.nan],[-1,1]]) - assert_equal(stats.ttest_ind(anan, np.zeros((2,2)), equal_var=False), - ([0, np.nan], [1,np.nan])) - finally: - np.seterr(**olderr) - - -def test_ttest_1samp_new(): - n1, n2, n3 = (10,15,20) - rvn1 = stats.norm.rvs(loc=5,scale=10,size=(n1,n2,n3)) - - # check multidimensional array and correct axis handling - # deterministic rvn1 and rvn2 would be better as in test_ttest_rel - t1,p1 = stats.ttest_1samp(rvn1[:,:,:], np.ones((n2,n3)),axis=0) - t2,p2 = stats.ttest_1samp(rvn1[:,:,:], 1,axis=0) - t3,p3 = stats.ttest_1samp(rvn1[:,0,0], 1) - assert_array_almost_equal(t1,t2, decimal=14) - assert_almost_equal(t1[0,0],t3, decimal=14) - assert_equal(t1.shape, (n2,n3)) - - t1,p1 = stats.ttest_1samp(rvn1[:,:,:], np.ones((n1,n3)),axis=1) - t2,p2 = stats.ttest_1samp(rvn1[:,:,:], 1,axis=1) - t3,p3 = stats.ttest_1samp(rvn1[0,:,0], 1) - assert_array_almost_equal(t1,t2, decimal=14) - assert_almost_equal(t1[0,0],t3, decimal=14) - assert_equal(t1.shape, (n1,n3)) - - t1,p1 = stats.ttest_1samp(rvn1[:,:,:], np.ones((n1,n2)),axis=2) - t2,p2 = stats.ttest_1samp(rvn1[:,:,:], 1,axis=2) - t3,p3 = stats.ttest_1samp(rvn1[0,0,:], 1) - assert_array_almost_equal(t1,t2, decimal=14) - assert_almost_equal(t1[0,0],t3, decimal=14) - assert_equal(t1.shape, (n1,n2)) - - olderr = np.seterr(all='ignore') - try: - # test zero division problem - t,p = stats.ttest_1samp([0,0,0], 1) - assert_equal((np.abs(t),p), (np.inf, 0)) - assert_equal(stats.ttest_1samp([0,0,0], 0), (np.nan, np.nan)) - - # check that nan in input array result in nan 
output - anan = np.array([[1,np.nan],[-1,1]]) - assert_equal(stats.ttest_1samp(anan, 0),([0, np.nan], [1,np.nan])) - finally: - np.seterr(**olderr) - - -def test_describe(): - x = np.vstack((np.ones((3,4)),2*np.ones((2,4)))) - nc, mmc = (5, ([1., 1., 1., 1.], [2., 2., 2., 2.])) - mc = np.array([1.4, 1.4, 1.4, 1.4]) - vc = np.array([0.3, 0.3, 0.3, 0.3]) - skc = [0.40824829046386357]*4 - kurtc = [-1.833333333333333]*4 - n, mm, m, v, sk, kurt = stats.describe(x) - assert_equal(n, nc) - assert_equal(mm, mmc) - assert_equal(m, mc) - assert_equal(v, vc) - assert_array_almost_equal(sk, skc, decimal=13) # not sure about precision - assert_array_almost_equal(kurt, kurtc, decimal=13) - n, mm, m, v, sk, kurt = stats.describe(x.T, axis=1) - assert_equal(n, nc) - assert_equal(mm, mmc) - assert_equal(m, mc) - assert_equal(v, vc) - assert_array_almost_equal(sk, skc, decimal=13) # not sure about precision - assert_array_almost_equal(kurt, kurtc, decimal=13) - - -def test_normalitytests(): - # numbers verified with R: dagoTest in package fBasics - st_normal, st_skew, st_kurt = (3.92371918, 1.98078826, -0.01403734) - pv_normal, pv_skew, pv_kurt = (0.14059673, 0.04761502, 0.98880019) - x = np.array((-2,-1,0,1,2,3)*4)**2 - yield assert_array_almost_equal, stats.normaltest(x), (st_normal, pv_normal) - yield assert_array_almost_equal, stats.skewtest(x), (st_skew, pv_skew) - yield assert_array_almost_equal, stats.kurtosistest(x), (st_kurt, pv_kurt) - - # Test axis=None (equal to axis=0 for 1-D input) - yield (assert_array_almost_equal, stats.normaltest(x, axis=None), - (st_normal, pv_normal)) - yield (assert_array_almost_equal, stats.skewtest(x, axis=None), - (st_skew, pv_skew)) - yield (assert_array_almost_equal, stats.kurtosistest(x, axis=None), - (st_kurt, pv_kurt)) - - -class TestJarqueBera(TestCase): - def test_jarque_bera_stats(self): - np.random.seed(987654321) - x = np.random.normal(0, 1, 100000) - y = np.random.chisquare(10000, 100000) - z = np.random.rayleigh(1, 100000) - - assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(y)[1]) - assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(z)[1]) - assert_(stats.jarque_bera(y)[1] > stats.jarque_bera(z)[1]) - - def test_jarque_bera_array_like(self): - np.random.seed(987654321) - x = np.random.normal(0, 1, 100000) - - JB1, p1 = stats.jarque_bera(list(x)) - JB2, p2 = stats.jarque_bera(tuple(x)) - JB3, p3 = stats.jarque_bera(x.reshape(2, 50000)) - - assert_(JB1 == JB2 == JB3) - assert_(p1 == p2 == p3) - - def test_jarque_bera_size(self): - assert_raises(ValueError, stats.jarque_bera, []) - - -def test_skewtest_too_few_samples(): - # Regression test for ticket #1492. - # skewtest requires at least 8 samples; 7 should raise a ValueError. - x = np.arange(7.0) - assert_raises(ValueError, stats.skewtest, x) - - -def test_kurtosistest_too_few_samples(): - # Regression test for ticket #1425. - # kurtosistest requires at least 5 samples; 4 should raise a ValueError. 
- x = np.arange(4.0) - assert_raises(ValueError, stats.kurtosistest, x) - - -def test_mannwhitneyu(): - x = np.array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 2., 1., 1., 2., 1., 1., - 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 1., 1., - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 1., 1., 1.]) - - y = np.array([1., 1., 1., 1., 1., 1., 1., 2., 1., 2., 1., 1., 1., - 1., 2., 1., 1., 1., 2., 1., 1., 1., 1., 1., 2., 1., 1., 3., 1., - 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 2., 1., 1., 1., 1., - 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 2., 1., 1., 1., 1., 1., 2., 2., 1., 1., 2., 1., 1., 2., - 1., 2., 1., 1., 1., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 2., 2., 2., 1., - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - 1., 2., 1., 1., 2., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 2., 1., 1., 1., 1., 1., - 1.]) - # p-value verified with matlab and R to 5 significant digits - assert_array_almost_equal(stats.stats.mannwhitneyu(x,y), - (16980.5, 2.8214327656317373e-005), decimal=12) - - -def test_pointbiserial(): - # same as mstats test except for the nan - # Test data: http://support.sas.com/ctx/samples/index.jsp?sid=490&tab=output - x = [1,0,1,1,1,1,0,1,0,0,0,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0, - 0,0,0,0,1] - y = [14.8,13.8,12.4,10.1,7.1,6.1,5.8,4.6,4.3,3.5,3.3,3.2,3.0, - 2.8,2.8,2.5,2.4,2.3,2.1,1.7,1.7,1.5,1.3,1.3,1.2,1.2,1.1, - 0.8,0.7,0.6,0.5,0.2,0.2,0.1] - assert_almost_equal(stats.pointbiserialr(x, y)[0], 0.36149, 5) - - -def test_obrientransform(): - # A couple tests calculated by hand. - x1 = np.array([0, 2, 4]) - t1 = stats.obrientransform(x1) - expected = [7, -2, 7] - assert_allclose(t1[0], expected) - - x2 = np.array([0, 3, 6, 9]) - t2 = stats.obrientransform(x2) - expected = np.array([30, 0, 0, 30]) - assert_allclose(t2[0], expected) - - # Test two arguments. - a, b = stats.obrientransform(x1, x2) - assert_equal(a, t1[0]) - assert_equal(b, t2[0]) - - # Test three arguments. - a, b, c = stats.obrientransform(x1, x2, x1) - assert_equal(a, t1[0]) - assert_equal(b, t2[0]) - assert_equal(c, t1[0]) - - # This is a regression test to check np.var replacement. - # The author of this test didn't separately verify the numbers. - x1 = np.arange(5) - result = np.array( - [[5.41666667, 1.04166667, -0.41666667, 1.04166667, 5.41666667], - [21.66666667, 4.16666667, -1.66666667, 4.16666667, 21.66666667]]) - assert_array_almost_equal(stats.obrientransform(x1, 2*x1), result, decimal=8) - - # Example from "O'Brien Test for Homogeneity of Variance" - # by Herve Abdi. 
- values = range(5, 11) - reps = np.array([5, 11, 9, 3, 2, 2]) - data = np.repeat(values, reps) - transformed_values = np.array([3.1828, 0.5591, 0.0344, - 1.6086, 5.2817, 11.0538]) - expected = np.repeat(transformed_values, reps) - result = stats.obrientransform(data) - assert_array_almost_equal(result[0], expected, decimal=4) - - -class HarMeanTestCase: - def test_1dlist(self): - # Test a 1d list - a = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] - b = 34.1417152147 - self.do(a, b) - - def test_1darray(self): - # Test a 1d array - a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) - b = 34.1417152147 - self.do(a, b) - - def test_1dma(self): - # Test a 1d masked array - a = np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) - b = 34.1417152147 - self.do(a, b) - - def test_1dmavalue(self): - # Test a 1d masked array with a masked value - a = np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100], - mask=[0,0,0,0,0,0,0,0,0,1]) - b = 31.8137186141 - self.do(a, b) - - # Note the next tests use axis=None as default, not axis=0 - def test_2dlist(self): - # Test a 2d list - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = 38.6696271841 - self.do(a, b) - - def test_2darray(self): - # Test a 2d array - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = 38.6696271841 - self.do(np.array(a), b) - - def test_2dma(self): - # Test a 2d masked array - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = 38.6696271841 - self.do(np.ma.array(a), b) - - def test_2daxis0(self): - # Test a 2d list with axis=0 - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.array([22.88135593, 39.13043478, 52.90076336, 65.45454545]) - self.do(a, b, axis=0) - - def test_2daxis1(self): - # Test a 2d list with axis=1 - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.array([19.2, 63.03939962, 103.80078637]) - self.do(a, b, axis=1) - - def test_2dmatrixdaxis0(self): - # Test a 2d list with axis=0 - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.matrix([[22.88135593, 39.13043478, 52.90076336, 65.45454545]]) - self.do(np.matrix(a), b, axis=0) - - def test_2dmatrixaxis1(self): - # Test a 2d list with axis=1 - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.matrix([[19.2, 63.03939962, 103.80078637]]).T - self.do(np.matrix(a), b, axis=1) - - -class TestHarMean(HarMeanTestCase, TestCase): - def do(self, a, b, axis=None, dtype=None): - x = stats.hmean(a, axis=axis, dtype=dtype) - assert_almost_equal(b, x) - assert_equal(x.dtype, dtype) - - -class GeoMeanTestCase: - def test_1dlist(self): - # Test a 1d list - a = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] - b = 45.2872868812 - self.do(a, b) - - def test_1darray(self): - # Test a 1d array - a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) - b = 45.2872868812 - self.do(a, b) - - def test_1dma(self): - # Test a 1d masked array - a = np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) - b = 45.2872868812 - self.do(a, b) - - def test_1dmavalue(self): - # Test a 1d masked array with a masked value - a = np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100], mask=[0,0,0,0,0,0,0,0,0,1]) - b = 41.4716627439 - self.do(a, b) - - # Note the next tests use axis=None as default, not axis=0 - def test_2dlist(self): - # Test a 2d list - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = 52.8885199 - self.do(a, b) - - def test_2darray(self): - # Test a 2d array - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = 
52.8885199 - self.do(np.array(a), b) - - def test_2dma(self): - # Test a 2d masked array - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = 52.8885199 - self.do(np.ma.array(a), b) - - def test_2daxis0(self): - # Test a 2d list with axis=0 - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.array([35.56893304, 49.32424149, 61.3579244, 72.68482371]) - self.do(a, b, axis=0) - - def test_2daxis1(self): - # Test a 2d list with axis=1 - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.array([22.13363839, 64.02171746, 104.40086817]) - self.do(a, b, axis=1) - - def test_2dmatrixdaxis0(self): - # Test a 2d list with axis=0 - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.matrix([[35.56893304, 49.32424149, 61.3579244, 72.68482371]]) - self.do(np.matrix(a), b, axis=0) - - def test_2dmatrixaxis1(self): - # Test a 2d list with axis=1 - a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.matrix([[22.13363839, 64.02171746, 104.40086817]]).T - self.do(np.matrix(a), b, axis=1) - - def test_1dlist0(self): - # Test a 1d list with zero element - a = [10, 20, 30, 40, 50, 60, 70, 80, 90, 0] - b = 0.0 # due to exp(-inf)=0 - olderr = np.seterr(all='ignore') - try: - self.do(a, b) - finally: - np.seterr(**olderr) - - def test_1darray0(self): - # Test a 1d array with zero element - a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 0]) - b = 0.0 # due to exp(-inf)=0 - olderr = np.seterr(all='ignore') - try: - self.do(a, b) - finally: - np.seterr(**olderr) - - def test_1dma0(self): - # Test a 1d masked array with zero element - a = np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 0]) - b = 41.4716627439 - olderr = np.seterr(all='ignore') - try: - self.do(a, b) - finally: - np.seterr(**olderr) - - def test_1dmainf(self): - # Test a 1d masked array with negative element - a = np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, -1]) - b = 41.4716627439 - olderr = np.seterr(all='ignore') - try: - self.do(a, b) - finally: - np.seterr(**olderr) - - -class TestGeoMean(GeoMeanTestCase, TestCase): - def do(self, a, b, axis=None, dtype=None): - # Note this doesn't test when axis is not specified - x = stats.gmean(a, axis=axis, dtype=dtype) - assert_almost_equal(b, x) - assert_equal(x.dtype, dtype) - - -def test_binomtest(): - # precision tests compared to R for ticket:986 - pp = np.concatenate((np.linspace(0.1,0.2,5), np.linspace(0.45,0.65,5), - np.linspace(0.85,0.95,5))) - n = 501 - x = 450 - results = [0.0, 0.0, 1.0159969301994141e-304, - 2.9752418572150531e-275, 7.7668382922535275e-250, - 2.3381250925167094e-099, 7.8284591587323951e-081, - 9.9155947819961383e-065, 2.8729390725176308e-050, - 1.7175066298388421e-037, 0.0021070691951093692, - 0.12044570587262322, 0.88154763174802508, 0.027120993063129286, - 2.6102587134694721e-006] - - for p, res in zip(pp,results): - assert_approx_equal(stats.binom_test(x, n, p), res, - significant=12, err_msg='fail for p=%f' % p) - - assert_approx_equal(stats.binom_test(50,100,0.1), 5.8320387857343647e-024, - significant=12, err_msg='fail for p=%f' % 0.1) - - -def test_binomtest2(): - # test added for issue #2384 - res2 = [ - [1.0, 1.0], - [0.5,1.0,0.5], - [0.25,1.00,1.00,0.25], - [0.125,0.625,1.000,0.625,0.125], - [0.0625,0.3750,1.0000,1.0000,0.3750,0.0625], - [0.03125,0.21875,0.68750,1.00000,0.68750,0.21875,0.03125], - [0.015625,0.125000,0.453125,1.000000,1.000000,0.453125,0.125000,0.015625], - [0.0078125,0.0703125,0.2890625,0.7265625,1.0000000,0.7265625,0.2890625, - 0.0703125,0.0078125], -
[0.00390625,0.03906250,0.17968750,0.50781250,1.00000000,1.00000000, - 0.50781250,0.17968750,0.03906250,0.00390625], - [0.001953125,0.021484375,0.109375000,0.343750000,0.753906250,1.000000000, - 0.753906250,0.343750000,0.109375000,0.021484375,0.001953125] - ] - - for k in range(1, 11): - res1 = [stats.binom_test(v, k, 0.5) for v in range(k + 1)] - assert_almost_equal(res1, res2[k-1], decimal=10) - - -def test_binomtest3(): - # test added for issue #2384 - # test when x == n*p and neighbors - res3 = [stats.binom_test(v, v*k, 1./k) for v in range(1, 11) - for k in range(2, 11)] - assert_equal(res3, np.ones(len(res3), int)) - - #> bt=c() - #> for(i in as.single(1:10)){for(k in as.single(2:10)){bt = c(bt, binom.test(i-1, k*i,(1/k))$p.value); print(c(i+1, k*i,(1/k)))}} - binom_testm1 = np.array([ - 0.5, 0.5555555555555556, 0.578125, 0.5904000000000003, - 0.5981224279835393, 0.603430543396034, 0.607304096221924, - 0.610255656871054, 0.612579511000001, 0.625, 0.670781893004115, - 0.68853759765625, 0.6980101120000006, 0.703906431368616, - 0.70793209416498, 0.7108561134173507, 0.713076544331419, - 0.714820192935702, 0.6875, 0.7268709038256367, 0.7418963909149174, - 0.74986110468096, 0.7548015520398076, 0.7581671424768577, - 0.760607984787832, 0.762459425024199, 0.7639120677676575, 0.7265625, - 0.761553963657302, 0.774800934828818, 0.7818005980538996, - 0.78613491480358, 0.789084353140195, 0.7912217659828884, - 0.79284214559524, 0.794112956558801, 0.75390625, 0.7856929451142176, - 0.7976688481430754, 0.8039848974727624, 0.807891868948366, - 0.8105487660137676, 0.812473307174702, 0.8139318233591120, - 0.815075399104785, 0.7744140625, 0.8037322594985427, - 0.814742863657656, 0.8205425178645808, 0.8241275984172285, - 0.8265645374416, 0.8283292196088257, 0.829666291102775, - 0.8307144686362666, 0.7905273437499996, 0.8178712053954738, - 0.828116983756619, 0.833508948940494, 0.8368403871552892, - 0.839104213210105, 0.840743186196171, 0.84198481438049, - 0.8429580531563676, 0.803619384765625, 0.829338573944648, - 0.8389591907548646, 0.84401876783902, 0.84714369697889, - 0.8492667010581667, 0.850803474598719, 0.851967542858308, - 0.8528799045949524, 0.8145294189453126, 0.838881732845347, - 0.847979024541911, 0.852760894015685, 0.8557134656773457, - 0.8577190131799202, 0.85917058278431, 0.860270010472127, - 0.861131648404582, 0.823802947998047, 0.846984756807511, - 0.855635653643743, 0.860180994825685, 0.86298688573253, - 0.864892525675245, 0.866271647085603, 0.867316125625004, - 0.8681346531755114 - ]) - - # > bt=c() - # > for(i in as.single(1:10)){for(k in as.single(2:10)){bt = c(bt, binom.test(i+1, k*i,(1/k))$p.value); print(c(i+1, k*i,(1/k)))}} - - binom_testp1 = np.array([ - 0.5, 0.259259259259259, 0.26171875, 0.26272, 0.2632244513031551, - 0.2635138663069203, 0.2636951804161073, 0.2638162407564354, - 0.2639010709000002, 0.625, 0.4074074074074074, 0.42156982421875, - 0.4295746560000003, 0.43473045988554, 0.4383309503172684, - 0.4409884859402103, 0.4430309389962837, 0.444649849401104, 0.6875, - 0.4927602499618962, 0.5096031427383425, 0.5189636628480, - 0.5249280070771274, 0.5290623300865124, 0.5320974248125793, - 0.5344204730474308, 0.536255847400756, 0.7265625, 0.5496019313526808, - 0.5669248746708034, 0.576436455045805, 0.5824538812831795, - 0.5866053321547824, 0.589642781414643, 0.5919618019300193, - 0.593790427805202, 0.75390625, 0.590868349763505, 0.607983393277209, - 0.617303847446822, 0.623172512167948, 0.627208862156123, - 0.6301556891501057, 0.632401894928977, 0.6341708982290303, - 0.7744140625, 
0.622562037497196, 0.639236102912278, 0.648263335014579, - 0.65392850011132, 0.657816519817211, 0.660650782947676, - 0.662808780346311, 0.6645068560246006, 0.7905273437499996, - 0.6478843304312477, 0.6640468318879372, 0.6727589686071775, - 0.6782129857784873, 0.681950188903695, 0.684671508668418, - 0.686741824999918, 0.688369886732168, 0.803619384765625, - 0.668716055304315, 0.684360013879534, 0.6927642396829181, - 0.6980155964704895, 0.701609591890657, 0.7042244320992127, - 0.7062125081341817, 0.707775152962577, 0.8145294189453126, - 0.686243374488305, 0.7013873696358975, 0.709501223328243, - 0.714563595144314, 0.718024953392931, 0.7205416252126137, - 0.722454130389843, 0.723956813292035, 0.823802947998047, - 0.701255953767043, 0.715928221686075, 0.723772209289768, - 0.7286603031173616, 0.7319999279787631, 0.7344267920995765, - 0.736270323773157, 0.737718376096348 - ]) - - res4_p1 = [stats.binom_test(v+1, v*k, 1./k) for v in range(1, 11) - for k in range(2, 11)] - res4_m1 = [stats.binom_test(v-1, v*k, 1./k) for v in range(1, 11) - for k in range(2, 11)] - - assert_almost_equal(res4_p1, binom_testp1, decimal=13) - assert_almost_equal(res4_m1, binom_testm1, decimal=13) - - -class TestTrim(object): - # test trim functions - def test_trim1(self): - a = np.arange(11) - assert_equal(stats.trim1(a, 0.1), np.arange(10)) - assert_equal(stats.trim1(a, 0.2), np.arange(9)) - assert_equal(stats.trim1(a, 0.2, tail='left'), np.arange(2,11)) - assert_equal(stats.trim1(a, 3/11., tail='left'), np.arange(3,11)) - - def test_trimboth(self): - a = np.arange(11) - assert_equal(stats.trimboth(a, 3/11.), np.arange(3,8)) - assert_equal(stats.trimboth(a, 0.2), np.array([2, 3, 4, 5, 6, 7, 8])) - assert_equal(stats.trimboth(np.arange(24).reshape(6,4), 0.2), - np.arange(4,20).reshape(4,4)) - assert_equal(stats.trimboth(np.arange(24).reshape(4,6).T, 2/6.), - np.array([[2, 8, 14, 20],[3, 9, 15, 21]])) - assert_raises(ValueError, stats.trimboth, - np.arange(24).reshape(4,6).T, 4/6.) - - def test_trim_mean(self): - # don't use pre-sorted arrays - a = np.array([4, 8, 2, 0, 9, 5, 10, 1, 7, 3, 6]) - idx = np.array([3, 5, 0, 1, 2, 4]) - a2 = np.arange(24).reshape(6, 4)[idx, :] - a3 = np.arange(24).reshape(6, 4, order='F')[idx, :] - assert_equal(stats.trim_mean(a3, 2/6.), - np.array([2.5, 8.5, 14.5, 20.5])) - assert_equal(stats.trim_mean(a2, 2/6.), - np.array([10., 11., 12., 13.])) - idx4 = np.array([1, 0, 3, 2]) - a4 = np.arange(24).reshape(4, 6)[idx4, :] - assert_equal(stats.trim_mean(a4, 2/6.), - np.array([9., 10., 11., 12., 13., 14.])) - # shuffled arange(24) as array_like - a = [7, 11, 12, 21, 16, 6, 22, 1, 5, 0, 18, 10, 17, 9, 19, 15, 23, - 20, 2, 14, 4, 13, 8, 3] - assert_equal(stats.trim_mean(a, 2/6.), 11.5) - assert_equal(stats.trim_mean([5,4,3,1,2,0], 2/6.), 2.5) - - # check axis argument - np.random.seed(1234) - a = np.random.randint(20, size=(5, 6, 4, 7)) - for axis in [0, 1, 2, 3, -1]: - res1 = stats.trim_mean(a, 2/6., axis=axis) - res2 = stats.trim_mean(np.rollaxis(a, axis), 2/6.) - assert_equal(res1, res2) - - res1 = stats.trim_mean(a, 2/6., axis=None) - res2 = stats.trim_mean(a.ravel(), 2/6.) 
- assert_equal(res1, res2) - - assert_raises(ValueError, stats.trim_mean, a, 0.6) - - -class TestSigamClip(object): - def test_sigmaclip1(self): - a = np.concatenate((np.linspace(9.5,10.5,31),np.linspace(0,20,5))) - fact = 4 # default - c, low, upp = stats.sigmaclip(a) - assert_(c.min() > low) - assert_(c.max() < upp) - assert_equal(low, c.mean() - fact*c.std()) - assert_equal(upp, c.mean() + fact*c.std()) - assert_equal(c.size, a.size) - - def test_sigmaclip2(self): - a = np.concatenate((np.linspace(9.5,10.5,31),np.linspace(0,20,5))) - fact = 1.5 - c, low, upp = stats.sigmaclip(a, fact, fact) - assert_(c.min() > low) - assert_(c.max() < upp) - assert_equal(low, c.mean() - fact*c.std()) - assert_equal(upp, c.mean() + fact*c.std()) - assert_equal(c.size, 4) - assert_equal(a.size, 36) # check original array unchanged - - def test_sigmaclip3(self): - a = np.concatenate((np.linspace(9.5,10.5,11),np.linspace(-100,-50,3))) - fact = 1.8 - c, low, upp = stats.sigmaclip(a, fact, fact) - assert_(c.min() > low) - assert_(c.max() < upp) - assert_equal(low, c.mean() - fact*c.std()) - assert_equal(upp, c.mean() + fact*c.std()) - assert_equal(c, np.linspace(9.5,10.5,11)) - - -class TestFOneWay(TestCase): - def test_trivial(self): - # A trivial test of stats.f_oneway, with F=0. - F, p = stats.f_oneway([0,2], [0,2]) - assert_equal(F, 0.0) - - def test_basic(self): - # Despite being a floating point calculation, this data should - # result in F being exactly 2.0. - F, p = stats.f_oneway([0,2], [2,4]) - assert_equal(F, 2.0) - - def test_large_integer_array(self): - a = np.array([655, 788], dtype=np.uint16) - b = np.array([789, 772], dtype=np.uint16) - F, p = stats.f_oneway(a, b) - assert_almost_equal(F, 0.77450216931805538) - - -class TestKruskal(TestCase): - def test_simple(self): - x = [1] - y = [2] - h, p = stats.kruskal(x, y) - assert_equal(h, 1.0) - assert_approx_equal(p, stats.chisqprob(h, 1)) - h, p = stats.kruskal(np.array(x), np.array(y)) - assert_equal(h, 1.0) - assert_approx_equal(p, stats.chisqprob(h, 1)) - - def test_basic(self): - x = [1, 3, 5, 7, 9] - y = [2, 4, 6, 8, 10] - h, p = stats.kruskal(x, y) - assert_approx_equal(h, 3./11, significant=10) - assert_approx_equal(p, stats.chisqprob(3./11, 1)) - h, p = stats.kruskal(np.array(x), np.array(y)) - assert_approx_equal(h, 3./11, significant=10) - assert_approx_equal(p, stats.chisqprob(3./11, 1)) - - def test_simple_tie(self): - x = [1] - y = [1, 2] - h_uncorr = 1.5**2 + 2*2.25**2 - 12 - corr = 0.75 - expected = h_uncorr / corr # 0.5 - h, p = stats.kruskal(x, y) - # Since the expression is simple and the exact answer is 0.5, it - # should be safe to use assert_equal(). - assert_equal(h, expected) - - def test_another_tie(self): - x = [1, 1, 1, 2] - y = [2, 2, 2, 2] - h_uncorr = (12. / 8. / 9.) * 4 * (3**2 + 6**2) - 3 * 9 - corr = 1 - float(3**3 - 3 + 5**3 - 5) / (8**3 - 8) - expected = h_uncorr / corr - h, p = stats.kruskal(x, y) - assert_approx_equal(h, expected) - - def test_three_groups(self): - # A test of stats.kruskal with three groups, with ties. - x = [1, 1, 1] - y = [2, 2, 2] - z = [2, 2] - h_uncorr = (12. / 8. / 9.) 
* (3*2**2 + 3*6**2 + 2*6**2) - 3 * 9 # 5.0 - corr = 1 - float(3**3 - 3 + 5**3 - 5) / (8**3 - 8) - expected = h_uncorr / corr # 7.0 - h, p = stats.kruskal(x, y, z) - assert_approx_equal(h, expected) - assert_approx_equal(p, stats.chisqprob(h, 2)) - - -if __name__ == "__main__": - run_module_suite() diff --git a/wafo/stats/tests/test_tukeylambda_stats.py b/wafo/stats/tests/test_tukeylambda_stats.py deleted file mode 100644 index 9d3d654..0000000 --- a/wafo/stats/tests/test_tukeylambda_stats.py +++ /dev/null @@ -1,91 +0,0 @@ -from __future__ import division, print_function, absolute_import - -import numpy as np -from numpy.testing import assert_allclose, assert_equal, run_module_suite - -from scipy.stats._tukeylambda_stats import tukeylambda_variance, \ - tukeylambda_kurtosis - - -def test_tukeylambda_stats_known_exact(): - """Compare results with some known exact formulas.""" - # Some exact values of the Tukey Lambda variance and kurtosis: - # lambda var kurtosis - # 0 pi**2/3 6/5 (logistic distribution) - # 0.5 4 - pi (5/3 - pi/2)/(pi/4 - 1)**2 - 3 - # 1 1/3 -6/5 (uniform distribution on (-1,1)) - # 2 1/12 -6/5 (uniform distribution on (-1/2, 1/2)) - - # lambda = 0 - var = tukeylambda_variance(0) - assert_allclose(var, np.pi**2 / 3, atol=1e-12) - kurt = tukeylambda_kurtosis(0) - assert_allclose(kurt, 1.2, atol=1e-10) - - # lambda = 0.5 - var = tukeylambda_variance(0.5) - assert_allclose(var, 4 - np.pi, atol=1e-12) - kurt = tukeylambda_kurtosis(0.5) - desired = (5./3 - np.pi/2) / (np.pi/4 - 1)**2 - 3 - assert_allclose(kurt, desired, atol=1e-10) - - # lambda = 1 - var = tukeylambda_variance(1) - assert_allclose(var, 1.0 / 3, atol=1e-12) - kurt = tukeylambda_kurtosis(1) - assert_allclose(kurt, -1.2, atol=1e-10) - - # lambda = 2 - var = tukeylambda_variance(2) - assert_allclose(var, 1.0 / 12, atol=1e-12) - kurt = tukeylambda_kurtosis(2) - assert_allclose(kurt, -1.2, atol=1e-10) - - -def test_tukeylambda_stats_mpmath(): - """Compare results with some values that were computed using mpmath.""" - a10 = dict(atol=1e-10, rtol=0) - a12 = dict(atol=1e-12, rtol=0) - data = [ - # lambda variance kurtosis - [-0.1, 4.78050217874253547, 3.78559520346454510], - [-0.0649, 4.16428023599895777, 2.52019675947435718], - [-0.05, 3.93672267890775277, 2.13129793057777277], - [-0.001, 3.30128380390964882, 1.21452460083542988], - [0.001, 3.27850775649572176, 1.18560634779287585], - [0.03125, 2.95927803254615800, 0.804487555161819980], - [0.05, 2.78281053405464501, 0.611604043886644327], - [0.0649, 2.65282386754100551, 0.476834119532774540], - [1.2, 0.242153920578588346, -1.23428047169049726], - [10.0, 0.00095237579757703597, 2.37810697355144933], - [20.0, 0.00012195121951131043, 7.37654321002709531], - ] - - for lam, var_expected, kurt_expected in data: - var = tukeylambda_variance(lam) - assert_allclose(var, var_expected, **a12) - kurt = tukeylambda_kurtosis(lam) - assert_allclose(kurt, kurt_expected, **a10) - - # Test with vector arguments (most of the other tests are for single - # values). 
- lam, var_expected, kurt_expected = zip(*data) - var = tukeylambda_variance(lam) - assert_allclose(var, var_expected, **a12) - kurt = tukeylambda_kurtosis(lam) - assert_allclose(kurt, kurt_expected, **a10) - - -def test_tukeylambda_stats_invalid(): - """Test values of lambda outside the domains of the functions.""" - lam = [-1.0, -0.5] - var = tukeylambda_variance(lam) - assert_equal(var, np.array([np.nan, np.inf])) - - lam = [-1.0, -0.25] - kurt = tukeylambda_kurtosis(lam) - assert_equal(kurt, np.array([np.nan, np.inf])) - - -if __name__ == "__main__": - run_module_suite() diff --git a/wafo/stats/twolumps.py b/wafo/stats/twolumps.py deleted file mode 100644 index d0eb627..0000000 --- a/wafo/stats/twolumps.py +++ /dev/null @@ -1,412 +0,0 @@ -""" -Commentary ----------- - -Most of the work is done by the scipy.stats.distributions module. - -This provides a plethora of continuous distributions to play with. - -Each distribution has functions to generate random deviates, pdf's, -cdf's etc. as well as a function to fit the distribution to some given -data. - -The fitting uses scipy.optimize.fmin to minimise the log odds of the -data given the distribution. - -There are a couple of problems with this approach. First it is -sensitive to the initial guess at the parameters. Second it can be a -little slow. - -Two key parameters are the 'loc' and 'scale' parameters. Data is -shifted by 'loc' and scaled by scale prior to fitting. Supplying -appropriate values for these parameters is important to getting a good -fit. - -See the factory() function which picks from a handful of common -approaches for each distribution. - -For some distributions (eg normal) it really makes sense just to -calculate the parameters directly from the data. - -The code in the __ifmain__ should be a good guide how to use this. - -Simply: - get a QuickFit object - add the distributions you want to try to fit - call fit() with your data - call fit_stats() to generate some stats on the fit. - call plot() if you want to see a plot. - - -Named after Mrs Twolumps, minister's secretary in the silly walks -sketch, who brings in coffee with a full silly walk. - -Tenuous link with curve fitting is that you generally see "two lumps" -one in your data and the other in the curve that is being fitted. - -Or alternately, if your data is not too silly then you can fit a -curve to it. - -License is GNU LGPL v3, see https://launchpad.net/twolumps -""" -import inspect -from itertools import izip - -import numpy -from wafo import stats -from scipy import mean, std - -def factory(name): - """ Factory to return appropriate objects for each distro. """ - fitters = dict( - - beta=ZeroOneScipyDistribution, - alpha=ZeroOneScipyDistribution, - ncf=ZeroOneScipyDistribution, - triang=ZeroOneScipyDistribution, - uniform=ZeroOneScipyDistribution, - powerlaw=ZeroOneScipyDistribution, - - pareto=MinLocScipyDistribution, - expon=MinLocScipyDistribution, - gamma=MinLocScipyDistribution, - lognorm=MinLocScipyDistribution, - maxwell=MinLocScipyDistribution, - weibull_min=MinLocScipyDistribution, - - weibull_max=MaxLocScipyDistribution) - - return fitters.get(name, ScipyDistribution)(name) - - -def get_continuous_distros(): - """ Find all attributes of stats that are continuous distributions. 
""" - - fitters = [] - skip = set() - for name, item in inspect.getmembers(stats): - if name in skip: continue - if item is stats.rv_continuous: continue - if isinstance(item, stats.rv_continuous): - fitters.append([name, factory(name)]) - - return fitters - - -class ScipyDistribution(object): - - def __init__(self, name): - - self.name = name - self.distro = self.get_distro() - self.fitted = None - - def __getattr__(self, attr): - """ Try delegating to the distro object """ - return getattr(self.distro, attr) - - def get_distro(self): - - return getattr(stats, self.name) - - def set_distro(self, parms): - - self.distro = getattr(stats, self.name)(*parms) - - return self.distro - - def calculate_loc_and_scale(self, data): - """ Calculate loc and scale parameters for fit. - - Depending on the distribution, these need to be approximately - right to get a good fit. - """ - return mean(data), std(data) - - def fit(self, data, *args, **kwargs): - """ This needs some work. - - Seems the various scipy distributions do a reasonable job if given a good hint. - - Need to get distro specific hints. - """ - - fits = [] - - # try with and without providing loc and scale hints - # increases chance of a fit without an exception being - # generated. - for (loc, scale) in ((0.0, 1.0), - self.calculate_loc_and_scale(data)): - - try: - parms = self.get_distro().fit(data, loc=loc, scale=scale) - - self.set_distro(list(parms)) - expected = self.expected(data) - rss = ((expected-data)**2).sum() - fits.append([rss, list(parms)]) - - parms = self.get_distro().fit(data, floc=loc, scale=scale) - - self.set_distro(list(parms)) - expected = self.expected(data) - rss = ((expected-data)**2).sum() - fits.append([rss, list(parms)]) - except: - pass - - # no fits means all tries raised exceptions - if not fits: - raise Exception("Exception in fit()") - - # pick the one with the smallest rss - fits.sort() - self.parms = fits[0][1] - print self.parms - - return self.set_distro(list(self.parms)) - - def expected(self, data): - """ Calculate expected values at each data point """ - if self.fitted is not None: - return self.fitted - - n = len(data) - xx = numpy.linspace(0, 1, n + 2)[1:-1] - self.fitted = self.ppf(xx) - #self.fitted = [self.ppf(x) for x in xx] - - return self.fitted - - def fit_stats(self, data): - """ Return stats on the fits - - data assumed to be sorted. - """ - n = len(data) - - dvar = numpy.var(data) - expected = self.expected(data) - evar = numpy.var(expected) - - rss = 0.0 - for expect, obs in izip(expected, data): - rss += (obs-expect) ** 2.0 - - self.rss = rss - self.dss = dvar * n - self.fss = evar * n - - def residuals(self, data): - """ Return residuals """ - expected = self.expected(data) - - return numpy.array(data) - numpy.array(expected) - - - -class MinLocScipyDistribution(ScipyDistribution): - - def calculate_loc_and_scale(self, data): - """ Set loc to min value in the data. - - Useful for weibull_min - """ - return min(data), std(data) - -class MaxLocScipyDistribution(ScipyDistribution): - - def calculate_loc_and_scale(self, data): - """ Set loc to max value in the data. - - Useful for weibull_max - """ - return max(data), std(data) - -class ZeroOneScipyDistribution(ScipyDistribution): - - def calculate_loc_and_scale(self, data): - """ Set loc and scale to move to [0, 1] interval. - - Useful for beta distribution - """ - return min(data), max(data)-min(data) - -class QuickFit(object): - """ Fit a family of distributions. - - Calculates stats on each fit. - - Option to create plots. 
- """ - - def __init__(self): - - self.distributions = [] - - def add_distribution(self, distribution): - """ Add a ready-prepared ScipyDistribution """ - self.distributions.append(distribution) - - def add(self, name): - """ Add a distribution by name. """ - - self.distributions.append(factory(name)) - - def fit(self, data): - """ Fit all of the distros we have """ - fitted = [] - for distro in self.distributions: - print 'fitting distro', distro.name - try: - distro.fit(data) - except: - continue - fitted.append(distro) - self.distributions = fitted - - print 'finished fitting' - - def stats(self, data): - """ Return stats on the fits """ - for dd in self.distributions: - dd.fit_stats(data) - - def get_topn(self, n): - """ Return top-n best fits. """ - data = [[x.rss, x] for x in self.distributions if numpy.isfinite(x.rss)] - data.sort() - - if not n: - n = len(data) - - return [x[1] for x in data[:n]] - - def fit_plot(self, data, topn=0, bins=20): - """ Create a plot. """ - from matplotlib import pylab as pl - - distros = self.get_topn(topn) - - xx = numpy.linspace(data.min(), data.max(), 300) - - table = [] - nparms = max(len(x.parms) for x in distros) - tcolours = [] - for dd in distros: - patch = pl.plot(xx, [dd.pdf(p) for p in xx], label='%10.2f%% %s' % (100.0*dd.rss/dd.dss, dd.name)) - row = ['', dd.name, '%10.2f%%' % (100.0*dd.rss/dd.dss,)] + ['%0.2f' % x for x in dd.parms] - while len(row) < 3 + nparms: - row.append('') - table.append(row) - tcolours.append([patch[0].get_markerfacecolor()] + ['w'] * (2+nparms)) - - # add a historgram with the data - pl.hist(data, bins=bins, normed=True) - tab = pl.table(cellText=table, cellColours=tcolours, - colLabels=['', 'Distribution', 'Res. SS/Data SS'] + ['P%d' % (x + 1,) for x in range(nparms)], - bbox=(0.0, 1.0, 1.0, 0.3)) - #loc='top')) - #pl.legend(loc=0) - tab.auto_set_font_size(False) - tab.set_fontsize(10.) - - def residual_plot(self, data, topn=0): - """ Create a residual plot. """ - from matplotlib import pylab as pl - - distros = self.get_topn(topn) - - - n = len(data) - xx = numpy.linspace(0, 1, n + 2)[1:-1] - for dd in distros: - - pl.plot(xx, dd.residuals(data), label='%10.2f%% %s' % (100.0*dd.rss/dd.dss, dd.name)) - pl.grid(True) - - def plot(self, data, topn): - """ Plot data fit and residuals """ - from matplotlib import pylab as pl - pl.axes([0.1, 0.4, 0.8, 0.4]) # leave room above the axes for the table - self.fit_plot(data, topn=topn) - - pl.axes([0.1, 0.05, 0.8, 0.3]) - self.residual_plot(data, topn=topn) - - -def read_data(infile, field): - """ Simple utility to extract a field out of a csv file. 
""" - import csv - - reader = csv.reader(infile) - header = reader.next() - field = header.index(field) - data = [] - for row in reader: - data.append(float(row[field])) - - return data - -if __name__ == '__main__': - - import sys - import optparse - - from matplotlib import pylab as pl - - parser = optparse.OptionParser() - parser.add_option('-d', '--distro', action='append', default=[]) - parser.add_option('-l', '--list', action='store_true', - help='List available distros') - - parser.add_option('-i', '--infile') - parser.add_option('-f', '--field', default='P/L') - - parser.add_option('-n', '--topn', type='int', default=0) - - parser.add_option('-s', '--sample', default='normal', - help='generate a sample from this distro as a test') - parser.add_option('--size', type='int', default=1000, - help='Size of sample to generate') - - - opts, args = parser.parse_args() - - if opts.list: - for name, distro in get_continuous_distros(): - print name - sys.exit() - opts.distro = ['weibull_min', 'norm'] - if not opts.distro: - opts.distro = [x[0] for x in get_continuous_distros()] - - quickfit = QuickFit() - for distro in opts.distro: - quickfit.add(distro) - - if opts.sample: - data = getattr(numpy.random, opts.sample)(size=opts.size) - else: - data = numpy.array(read_data(open(opts.infile), opts.field)) - - data.sort() - - quickfit.fit(data) - print 'doing stats' - quickfit.stats(data) - - print 'doing plot' - quickfit.plot(data, topn=opts.topn) - pl.show() - - - - - - - - - - diff --git a/wafo/stats/vonmises.py b/wafo/stats/vonmises.py deleted file mode 100644 index 753bf6b..0000000 --- a/wafo/stats/vonmises.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import division, print_function, absolute_import - -import numpy as np -import scipy.stats -from scipy.special import i0 - - -def von_mises_cdf_series(k,x,p): - x = float(x) - s = np.sin(x) - c = np.cos(x) - sn = np.sin(p*x) - cn = np.cos(p*x) - R = 0 - V = 0 - for n in range(p-1,0,-1): - sn, cn = sn*c - cn*s, cn*c + sn*s - R = 1./(2*n/k + R) - V = R*(sn/n+V) - - return 0.5+x/(2*np.pi) + V/np.pi - - -def von_mises_cdf_normalapprox(k,x,C1): - b = np.sqrt(2/np.pi)*np.exp(k)/i0(k) - z = b*np.sin(x/2.) - return scipy.stats.norm.cdf(z) - - -def von_mises_cdf(k,x): - ix = 2*np.pi*np.round(x/(2*np.pi)) - x = x-ix - k = float(k) - - # These values should give 12 decimal digits - CK = 50 - a = [28., 0.5, 100., 5.0] - C1 = 50.1 - - if k < CK: - p = int(np.ceil(a[0]+a[1]*k-a[2]/(k+a[3]))) - - F = np.clip(von_mises_cdf_series(k,x,p),0,1) - else: - F = von_mises_cdf_normalapprox(k,x,C1) - - return F+ix diff --git a/wafo/stats/vonmises_cython.pyx b/wafo/stats/vonmises_cython.pyx deleted file mode 100644 index 4c24986..0000000 --- a/wafo/stats/vonmises_cython.pyx +++ /dev/null @@ -1,76 +0,0 @@ -import numpy as np -import scipy.stats -from scipy.special import i0 -import numpy.testing -cimport numpy as np - -cdef extern from "math.h": - double cos(double theta) - double sin(double theta) - - -cdef double von_mises_cdf_series(double k,double x,unsigned int p): - cdef double s, c, sn, cn, R, V - cdef unsigned int n - s = sin(x) - c = cos(x) - sn = sin(p*x) - cn = cos(p*x) - R = 0 - V = 0 - for n in range(p-1,0,-1): - sn, cn = sn*c - cn*s, cn*c + sn*s - R = 1./(2*n/k + R) - V = R*(sn/n+V) - - return 0.5+x/(2*np.pi) + V/np.pi - -def von_mises_cdf_normalapprox(k,x,C1): - b = np.sqrt(2/np.pi)*np.exp(k)/i0(k) - z = b*np.sin(x/2.) 
- C = 24*k - chi = z - z**3/((C-2*z**2-16)/3.-(z**4+7/4.*z**2+167./2)/(C+C1-z**2+3))**2 - return scipy.stats.norm.cdf(z) - -cimport cython -@cython.boundscheck(False) -def von_mises_cdf(k,x): - cdef np.ndarray[double, ndim=1] temp, temp_xs, temp_ks - cdef unsigned int i, p - cdef double a1, a2, a3, a4, C1, CK - #k,x = np.broadcast_arrays(np.asarray(k),np.asarray(x)) - k = np.asarray(k) - x = np.asarray(x) - zerodim = k.ndim==0 and x.ndim==0 - - k = np.atleast_1d(k) - x = np.atleast_1d(x) - ix = np.round(x/(2*np.pi)) - x = x-ix*2*np.pi - - # These values should give 12 decimal digits - CK=50 - a1, a2, a3, a4 = [28., 0.5, 100., 5.0] - C1 = 50.1 - - bx, bk = np.broadcast_arrays(x,k) - result = np.empty(bx.shape,dtype=np.float) - - c_small_k = bk<CK - temp = result[c_small_k] - temp_xs = bx[c_small_k].astype(np.float) - temp_ks = bk[c_small_k].astype(np.float) - for i in range(len(temp)): - p = <int>(1+a1+a2*temp_ks[i]-a3/(temp_ks[i]+a4)) - temp[i] = von_mises_cdf_series(temp_ks[i],temp_xs[i],p) - if temp[i]<0: - temp[i]=0 - elif temp[i]>1: - temp[i]=1 - result[c_small_k] = temp - result[~c_small_k] = von_mises_cdf_normalapprox(bk[~c_small_k],bx[~c_small_k],C1) - - if not zerodim: - return result+ix - else: - return (result+ix)[0]
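
For reference, the von_mises_cdf implementations deleted above switch between a cosine-series expansion for small kappa (k < 50) and a normal approximation for large kappa. Since this patch now requires scipy >= 0.16, the same functionality is available through scipy.stats.vonmises.cdf, which is what makes the standalone copies redundant. Below is a minimal sketch of the small-kappa series branch checked against scipy; the helper name cdf_series and the sample values are illustrative, not part of the patch:

import numpy as np
from scipy.stats import vonmises

def cdf_series(k, x, p):
    # Cosine-series recursion transcribed from the deleted vonmises.py.
    s, c = np.sin(x), np.cos(x)
    sn, cn = np.sin(p * x), np.cos(p * x)
    R = V = 0.0
    for n in range(p - 1, 0, -1):
        sn, cn = sn * c - cn * s, cn * c + sn * s
        R = 1.0 / (2 * n / k + R)
        V = R * (sn / n + V)
    return 0.5 + x / (2 * np.pi) + V / np.pi

k, x = 5.0, 1.2  # small kappa; x inside (-pi, pi), so no period unwrapping is needed
p = int(np.ceil(28. + 0.5 * k - 100. / (k + 5.)))  # term-count rule from the deleted code
print(np.clip(cdf_series(k, x, p), 0, 1), vonmises.cdf(x, k))  # should agree to ~12 digits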