diff --git a/pywafo/src/wafo/stats/__init__.py b/pywafo/src/wafo/stats/__init__.py
index 57fccda..7d4ec85 100644
--- a/pywafo/src/wafo/stats/__init__.py
+++ b/pywafo/src/wafo/stats/__init__.py
@@ -1,12 +1,346 @@
 """
-Statistics package in WAFO Toolbox.
+==========================================
+Statistical functions (:mod:`wafo.stats`)
+==========================================
- Readme - Readme file for module STATS in WAFO Toolbox
+.. module:: wafo.stats
+
+This module contains a large number of probability distributions as
+well as a growing library of statistical functions.
+
+Each included distribution is an instance of the class rv_continuous:
+For each given name the following methods are available:
+
+.. autosummary::
+   :toctree: generated/
+
+   rv_continuous
+   rv_continuous.pdf
+   rv_continuous.logpdf
+   rv_continuous.cdf
+   rv_continuous.logcdf
+   rv_continuous.sf
+   rv_continuous.logsf
+   rv_continuous.ppf
+   rv_continuous.isf
+   rv_continuous.moment
+   rv_continuous.stats
+   rv_continuous.entropy
+   rv_continuous.fit
+   rv_continuous.expect
+
+Calling the instance as a function returns a frozen pdf whose shape,
+location, and scale parameters are fixed.
+
+Similarly, each discrete distribution is an instance of the class
+rv_discrete:
+
+.. autosummary::
+   :toctree: generated/
+
+   rv_discrete
+   rv_discrete.rvs
+   rv_discrete.pmf
+   rv_discrete.logpmf
+   rv_discrete.cdf
+   rv_discrete.logcdf
+   rv_discrete.sf
+   rv_discrete.logsf
+   rv_discrete.ppf
+   rv_discrete.isf
+   rv_discrete.stats
+   rv_discrete.moment
+   rv_discrete.entropy
+   rv_discrete.expect
+
+Continuous distributions
+========================
+
+.. autosummary::
+   :toctree: generated/
+
+   alpha -- Alpha
+   anglit -- Anglit
+   arcsine -- Arcsine
+   beta -- Beta
+   betaprime -- Beta Prime
+   bradford -- Bradford
+   burr -- Burr
+   cauchy -- Cauchy
+   chi -- Chi
+   chi2 -- Chi-squared
+   cosine -- Cosine
+   dgamma -- Double Gamma
+   dweibull -- Double Weibull
+   erlang -- Erlang
+   expon -- Exponential
+   exponweib -- Exponentiated Weibull
+   exponpow -- Exponential Power
+   f -- F (Snedecor F)
+   fatiguelife -- Fatigue Life (Birnbaum-Saunders)
+   fisk -- Fisk
+   foldcauchy -- Folded Cauchy
+   foldnorm -- Folded Normal
+   frechet_r -- Frechet Right Sided, Extreme Value Type II (Extreme LB) or weibull_min
+   frechet_l -- Frechet Left Sided, Weibull_max
+   genlogistic -- Generalized Logistic
+   genpareto -- Generalized Pareto
+   genexpon -- Generalized Exponential
+   genextreme -- Generalized Extreme Value
+   gausshyper -- Gauss Hypergeometric
+   gamma -- Gamma
+   gengamma -- Generalized gamma
+   genhalflogistic -- Generalized Half Logistic
+   gilbrat -- Gilbrat
+   gompertz -- Gompertz (Truncated Gumbel)
+   gumbel_r -- Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
+   gumbel_l -- Left Sided Gumbel, etc.
+ halfcauchy -- Half Cauchy + halflogistic -- Half Logistic + halfnorm -- Half Normal + hypsecant -- Hyperbolic Secant + invgamma -- Inverse Gamma + invgauss -- Inverse Gaussian + invweibull -- Inverse Weibull + johnsonsb -- Johnson SB + johnsonsu -- Johnson SU + ksone -- Kolmogorov-Smirnov one-sided (no stats) + kstwobign -- Kolmogorov-Smirnov two-sided test for Large N (no stats) + laplace -- Laplace + logistic -- Logistic + loggamma -- Log-Gamma + loglaplace -- Log-Laplace (Log Double Exponential) + lognorm -- Log-Normal + lomax -- Lomax (Pareto of the second kind) + maxwell -- Maxwell + mielke -- Mielke's Beta-Kappa + nakagami -- Nakagami + ncx2 -- Non-central chi-squared + ncf -- Non-central F + nct -- Non-central Student's T + norm -- Normal (Gaussian) + pareto -- Pareto + pearson3 -- Pearson type III + powerlaw -- Power-function + powerlognorm -- Power log normal + powernorm -- Power normal + rdist -- R-distribution + reciprocal -- Reciprocal + rayleigh -- Rayleigh + rice -- Rice + recipinvgauss -- Reciprocal Inverse Gaussian + semicircular -- Semicircular + t -- Student's T + triang -- Triangular + truncexpon -- Truncated Exponential + truncnorm -- Truncated Normal + tukeylambda -- Tukey-Lambda + uniform -- Uniform + vonmises -- Von-Mises (Circular) + wald -- Wald + weibull_min -- Minimum Weibull (see Frechet) + weibull_max -- Maximum Weibull (see Frechet) + wrapcauchy -- Wrapped Cauchy + +Multivariate distributions +========================== + +.. autosummary:: + :toctree: generated/ + + multivariate_normal -- Multivariate normal distribution + +Discrete distributions +====================== + +.. autosummary:: + :toctree: generated/ + + bernoulli -- Bernoulli + binom -- Binomial + boltzmann -- Boltzmann (Truncated Discrete Exponential) + dlaplace -- Discrete Laplacian + geom -- Geometric + hypergeom -- Hypergeometric + logser -- Logarithmic (Log-Series, Series) + nbinom -- Negative Binomial + planck -- Planck (Discrete Exponential) + poisson -- Poisson + randint -- Discrete Uniform + skellam -- Skellam + zipf -- Zipf + +Statistical functions +===================== + +Several of these functions have a similar version in scipy.stats.mstats +which work for masked arrays. + +.. autosummary:: + :toctree: generated/ + + describe -- Descriptive statistics + gmean -- Geometric mean + hmean -- Harmonic mean + kurtosis -- Fisher or Pearson kurtosis + kurtosistest -- + mode -- Modal value + moment -- Central moment + normaltest -- + skew -- Skewness + skewtest -- + tmean -- Truncated arithmetic mean + tvar -- Truncated variance + tmin -- + tmax -- + tstd -- + tsem -- + nanmean -- Mean, ignoring NaN values + nanstd -- Standard deviation, ignoring NaN values + nanmedian -- Median, ignoring NaN values + variation -- Coefficient of variation + +.. autosummary:: + :toctree: generated/ + + cumfreq _ + histogram2 _ + histogram _ + itemfreq _ + percentileofscore _ + scoreatpercentile _ + relfreq _ + +.. autosummary:: + :toctree: generated/ + + binned_statistic -- Compute a binned statistic for a set of data. + binned_statistic_2d -- Compute a 2-D binned statistic for a set of data. + binned_statistic_dd -- Compute a d-D binned statistic for a set of data. + +.. autosummary:: + :toctree: generated/ + + obrientransform + signaltonoise + bayes_mvs + sem + zmap + zscore + +.. autosummary:: + :toctree: generated/ + + threshold + trimboth + trim1 + +.. autosummary:: + :toctree: generated/ + + f_oneway + pearsonr + spearmanr + pointbiserialr + kendalltau + linregress + +.. 
autosummary:: + :toctree: generated/ + + ttest_1samp + ttest_ind + ttest_rel + kstest + chisquare + power_divergence + ks_2samp + mannwhitneyu + tiecorrect + rankdata + ranksums + wilcoxon + kruskal + friedmanchisquare + +.. autosummary:: + :toctree: generated/ + + ansari + bartlett + levene + shapiro + anderson + binom_test + fligner + mood + +.. autosummary:: + :toctree: generated/ + + boxcox + boxcox_normmax + boxcox_llf + +Contingency table functions +=========================== + +.. autosummary:: + :toctree: generated/ + + chi2_contingency + contingency.expected_freq + contingency.margins + fisher_exact + +Plot-tests +========== + +.. autosummary:: + :toctree: generated/ + + ppcc_max + ppcc_plot + probplot + boxcox_normplot + + +Masked statistics functions +=========================== + +.. toctree:: + + stats.mstats + + +Univariate and multivariate kernel density estimation (:mod:`scipy.stats.kde`) +============================================================================== + +.. autosummary:: + :toctree: generated/ + + gaussian_kde + +For many more stat related functions install the software R and the +interface package rpy. """ +from __future__ import division, print_function, absolute_import from scipy.stats import * -from core import * -import distributions #@Reimport -from wafo.stats.distributions import * +from .core import * +from .stats import * +from .distributions import * +from .rv import * +from .morestats import * +from ._binned_statistic import * +from .kde import gaussian_kde +from . import mstats +from .contingency import chi2_contingency +from ._multivariate import * +from . import estimation + +#remove vonmises_cython from __all__, I don't know why it is included +__all__ = [s for s in dir() if not (s.startswith('_') or s.endswith('cython'))] +#import distributions #@Reimport +#from wafo.stats.distributions import * -import estimation \ No newline at end of file diff --git a/pywafo/src/wafo/stats/_discrete_distns.py b/pywafo/src/wafo/stats/_discrete_distns.py new file mode 100644 index 0000000..2e922f7 --- /dev/null +++ b/pywafo/src/wafo/stats/_discrete_distns.py @@ -0,0 +1,762 @@ +# +# Author: Travis Oliphant 2002-2011 with contributions from +# SciPy Developers 2004-2011 +# +from __future__ import division, print_function, absolute_import + +from scipy import special +from scipy.special import gammaln as gamln + +from numpy import floor, ceil, log, exp, sqrt, log1p, expm1, tanh, cosh, sinh + +import numpy as np +import numpy.random as mtrand + +from ._distn_infrastructure import ( + rv_discrete, _lazywhere, _ncx2_pdf, _ncx2_cdf) + +__all__ = [ + 'binom', 'bernoulli', 'nbinom', 'geom', 'hypergeom', + 'logser', 'poisson', 'planck', 'boltzmann', 'randint', + 'zipf', 'dlaplace', 'skellam' + ] + + +class binom_gen(rv_discrete): + """A binomial discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `binom` is:: + + binom.pmf(k) = choose(n, k) * p**k * (1-p)**(n-k) + + for ``k`` in ``{0, 1,..., n}``. + + `binom` takes ``n`` and ``p`` as shape parameters. 
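A quick numerical sanity check of the formula above (illustrative only; it
assumes the instance is re-exported as ``wafo.stats.binom`` and behaves like
its scipy counterpart):

>>> import numpy as np
>>> from wafo.stats import binom
>>> np.allclose(binom.pmf(3, 10, 0.5), 120 * 0.5**10)  # choose(10, 3) == 120
True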
+ + %(example)s + + """ + def _rvs(self, n, p): + return mtrand.binomial(n, p, self._size) + + def _argcheck(self, n, p): + self.b = n + return (n >= 0) & (p >= 0) & (p <= 1) + + def _logpmf(self, x, n, p): + k = floor(x) + combiln = (gamln(n+1) - (gamln(k+1) + gamln(n-k+1))) + return combiln + special.xlogy(k, p) + special.xlog1py(n-k, -p) + + def _pmf(self, x, n, p): + return exp(self._logpmf(x, n, p)) + + def _cdf(self, x, n, p): + k = floor(x) + vals = special.bdtr(k, n, p) + return vals + + def _sf(self, x, n, p): + k = floor(x) + return special.bdtrc(k, n, p) + + def _ppf(self, q, n, p): + vals = ceil(special.bdtrik(q, n, p)) + vals1 = vals-1 + temp = special.bdtr(vals1, n, p) + return np.where(temp >= q, vals1, vals) + + def _stats(self, n, p): + q = 1.0-p + mu = n * p + var = n * p * q + g1 = (q-p) / sqrt(n*p*q) + g2 = (1.0-6*p*q)/(n*p*q) + return mu, var, g1, g2 + + def _entropy(self, n, p): + k = np.r_[0:n + 1] + vals = self._pmf(k, n, p) + h = -np.sum(special.xlogy(vals, vals), axis=0) + return h +binom = binom_gen(name='binom') + + +class bernoulli_gen(binom_gen): + """A Bernoulli discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `bernoulli` is:: + + bernoulli.pmf(k) = 1-p if k = 0 + = p if k = 1 + + for ``k`` in ``{0, 1}``. + + `bernoulli` takes ``p`` as shape parameter. + + %(example)s + + """ + def _rvs(self, p): + return binom_gen._rvs(self, 1, p) + + def _argcheck(self, p): + return (p >= 0) & (p <= 1) + + def _logpmf(self, x, p): + return binom._logpmf(x, 1, p) + + def _pmf(self, x, p): + return binom._pmf(x, 1, p) + + def _cdf(self, x, p): + return binom._cdf(x, 1, p) + + def _sf(self, x, p): + return binom._sf(x, 1, p) + + def _ppf(self, q, p): + return binom._ppf(q, 1, p) + + def _stats(self, p): + return binom._stats(1, p) + + def _entropy(self, p): + h = -special.xlogy(p, p) - special.xlogy(1 - p, 1 - p) + return h +bernoulli = bernoulli_gen(b=1, name='bernoulli') + + +class nbinom_gen(rv_discrete): + """A negative binomial discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `nbinom` is:: + + nbinom.pmf(k) = choose(k+n-1, n-1) * p**n * (1-p)**k + + for ``k >= 0``. + + `nbinom` takes ``n`` and ``p`` as shape parameters. + + %(example)s + + """ + def _rvs(self, n, p): + return mtrand.negative_binomial(n, p, self._size) + + def _argcheck(self, n, p): + return (n >= 0) & (p >= 0) & (p <= 1) + + def _pmf(self, x, n, p): + return exp(self._logpmf(x, n, p)) + + def _logpmf(self, x, n, p): + coeff = gamln(n+x) - gamln(x+1) - gamln(n) + return coeff + n*log(p) + x*log1p(-p) + + def _cdf(self, x, n, p): + k = floor(x) + return special.betainc(n, k+1, p) + + def _sf_skip(self, x, n, p): + # skip because special.nbdtrc doesn't work for 0= q, vals1, vals) + + def _stats(self, n, p): + Q = 1.0 / p + P = Q - 1.0 + mu = n*P + var = n*P*Q + g1 = (Q+P)/sqrt(n*P*Q) + g2 = (1.0 + 6*P*Q) / (n*P*Q) + return mu, var, g1, g2 +nbinom = nbinom_gen(name='nbinom') + + +class geom_gen(rv_discrete): + """A geometric discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `geom` is:: + + geom.pmf(k) = (1-p)**(k-1)*p + + for ``k >= 1``. + + `geom` takes ``p`` as shape parameter. 
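The pmf above can be checked directly for a small case (a sketch; it assumes
``geom`` is exposed from ``wafo.stats``):

>>> import numpy as np
>>> from wafo.stats import geom
>>> np.allclose(geom.pmf(3, 0.25), (1 - 0.25)**2 * 0.25)
True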
+ + %(example)s + + """ + def _rvs(self, p): + return mtrand.geometric(p, size=self._size) + + def _argcheck(self, p): + return (p <= 1) & (p >= 0) + + def _pmf(self, k, p): + return np.power(1-p, k-1) * p + + def _logpmf(self, k, p): + return (k-1)*log1p(-p) + log(p) + + def _cdf(self, x, p): + k = floor(x) + return -expm1(log1p(-p)*k) + + def _sf(self, x, p): + return np.exp(self._logsf(x, p)) + + def _logsf(self, x, p): + k = floor(x) + return k*log1p(-p) + + def _ppf(self, q, p): + vals = ceil(log1p(-q)/log1p(-p)) + temp = self._cdf(vals-1, p) + return np.where((temp >= q) & (vals > 0), vals-1, vals) + + def _stats(self, p): + mu = 1.0/p + qr = 1.0-p + var = qr / p / p + g1 = (2.0-p) / sqrt(qr) + g2 = np.polyval([1, -6, 6], p)/(1.0-p) + return mu, var, g1, g2 +geom = geom_gen(a=1, name='geom', longname="A geometric") + + +class hypergeom_gen(rv_discrete): + """A hypergeometric discrete random variable. + + The hypergeometric distribution models drawing objects from a bin. + M is the total number of objects, n is total number of Type I objects. + The random variate represents the number of Type I objects in N drawn + without replacement from the total population. + + %(before_notes)s + + Notes + ----- + The probability mass function is defined as:: + + pmf(k, M, n, N) = choose(n, k) * choose(M - n, N - k) / choose(M, N), + for max(0, N - (M-n)) <= k <= min(n, N) + + Examples + -------- + >>> from scipy.stats import hypergeom + + Suppose we have a collection of 20 animals, of which 7 are dogs. Then if + we want to know the probability of finding a given number of dogs if we + choose at random 12 of the 20 animals, we can initialize a frozen + distribution and plot the probability mass function: + + >>> [M, n, N] = [20, 7, 12] + >>> rv = hypergeom(M, n, N) + >>> x = np.arange(0, n+1) + >>> pmf_dogs = rv.pmf(x) + + >>> fig = plt.figure() + >>> ax = fig.add_subplot(111) + >>> ax.plot(x, pmf_dogs, 'bo') + >>> ax.vlines(x, 0, pmf_dogs, lw=2) + >>> ax.set_xlabel('# of dogs in our group of chosen animals') + >>> ax.set_ylabel('hypergeom PMF') + >>> plt.show() + + Instead of using a frozen distribution we can also use `hypergeom` + methods directly. To for example obtain the cumulative distribution + function, use: + + >>> prb = hypergeom.cdf(x, M, n, N) + + And to generate random numbers: + + >>> R = hypergeom.rvs(M, n, N, size=10) + + """ + def _rvs(self, M, n, N): + return mtrand.hypergeometric(n, M-n, N, size=self._size) + + def _argcheck(self, M, n, N): + cond = rv_discrete._argcheck(self, M, n, N) + cond &= (n <= M) & (N <= M) + self.a = max(N-(M-n), 0) + self.b = min(n, N) + return cond + + def _logpmf(self, k, M, n, N): + tot, good = M, n + bad = tot - good + return gamln(good+1) - gamln(good-k+1) - gamln(k+1) + gamln(bad+1) \ + - gamln(bad-N+k+1) - gamln(N-k+1) - gamln(tot+1) + gamln(tot-N+1) \ + + gamln(N+1) + + def _pmf(self, k, M, n, N): + # same as the following but numerically more precise + # return comb(good, k) * comb(bad, N-k) / comb(tot, N) + return exp(self._logpmf(k, M, n, N)) + + def _stats(self, M, n, N): + # tot, good, sample_size = M, n, N + # "wikipedia".replace('N', 'M').replace('n', 'N').replace('K', 'n') + M, n, N = 1.*M, 1.*n, 1.*N + m = M - n + p = n/M + mu = N*p + + var = m*n*N*(M - N)*1.0/(M*M*(M-1)) + g1 = (m - n)*(M-2*N) / (M-2.0) * sqrt((M-1.0) / (m*n*N*(M-N))) + + g2 = M*(M+1) - 6.*N*(M-N) - 6.*n*m + g2 *= (M-1)*M*M + g2 += 6.*n*N*(M-N)*m*(5.*M-6) + g2 /= n * N * (M-N) * m * (M-2.) * (M-3.) 
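        # At this point g2 holds the standard closed-form excess kurtosis of
        # the hypergeometric distribution: the numerator is assembled in the
        # three statements above and then divided by
        # n*N*(M-N)*m*(M-2)*(M-3); see the "wikipedia" renaming note at the
        # top of this method.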
+ return mu, var, g1, g2 + + def _entropy(self, M, n, N): + k = np.r_[N - (M - n):min(n, N) + 1] + vals = self.pmf(k, M, n, N) + h = -np.sum(special.xlogy(vals, vals), axis=0) + return h + + def _sf(self, k, M, n, N): + """More precise calculation, 1 - cdf doesn't cut it.""" + # This for loop is needed because `k` can be an array. If that's the + # case, the sf() method makes M, n and N arrays of the same shape. We + # therefore unpack all inputs args, so we can do the manual + # integration. + res = [] + for quant, tot, good, draw in zip(k, M, n, N): + # Manual integration over probability mass function. More accurate + # than integrate.quad. + k2 = np.arange(quant + 1, draw + 1) + res.append(np.sum(self._pmf(k2, tot, good, draw))) + return np.asarray(res) +hypergeom = hypergeom_gen(name='hypergeom') + + +# FIXME: Fails _cdfvec +class logser_gen(rv_discrete): + """A Logarithmic (Log-Series, Series) discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `logser` is:: + + logser.pmf(k) = - p**k / (k*log(1-p)) + + for ``k >= 1``. + + `logser` takes ``p`` as shape parameter. + + %(example)s + + """ + def _rvs(self, p): + # looks wrong for p>0.5, too few k=1 + # trying to use generic is worse, no k=1 at all + return mtrand.logseries(p, size=self._size) + + def _argcheck(self, p): + return (p > 0) & (p < 1) + + def _pmf(self, k, p): + return -np.power(p, k) * 1.0 / k / log1p(- p) + + def _stats(self, p): + r = log1p(-p) + mu = p / (p - 1.0) / r + mu2p = -p / r / (p - 1.0)**2 + var = mu2p - mu*mu + mu3p = -p / r * (1.0+p) / (1.0 - p)**3 + mu3 = mu3p - 3*mu*mu2p + 2*mu**3 + g1 = mu3 / np.power(var, 1.5) + + mu4p = -p / r * ( + 1.0 / (p-1)**2 - 6*p / (p - 1)**3 + 6*p*p / (p-1)**4) + mu4 = mu4p - 4*mu3p*mu + 6*mu2p*mu*mu - 3*mu**4 + g2 = mu4 / var**2 - 3.0 + return mu, var, g1, g2 +logser = logser_gen(a=1, name='logser', longname='A logarithmic') + + +class poisson_gen(rv_discrete): + """A Poisson discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `poisson` is:: + + poisson.pmf(k) = exp(-mu) * mu**k / k! + + for ``k >= 0``. + + `poisson` takes ``mu`` as shape parameter. + + %(example)s + + """ + def _rvs(self, mu): + return mtrand.poisson(mu, self._size) + + def _logpmf(self, k, mu): + Pk = k*log(mu)-gamln(k+1) - mu + return Pk + + def _pmf(self, k, mu): + return exp(self._logpmf(k, mu)) + + def _cdf(self, x, mu): + k = floor(x) + return special.pdtr(k, mu) + + def _sf(self, x, mu): + k = floor(x) + return special.pdtrc(k, mu) + + def _ppf(self, q, mu): + vals = ceil(special.pdtrik(q, mu)) + vals1 = vals - 1 + temp = special.pdtr(vals1, mu) + return np.where((temp >= q), vals1, vals) + + def _stats(self, mu): + var = mu + tmp = np.asarray(mu) + g1 = sqrt(1.0 / tmp) + g2 = 1.0 / tmp + return mu, var, g1, g2 +poisson = poisson_gen(name="poisson", longname='A Poisson') + + +class planck_gen(rv_discrete): + """A Planck discrete exponential random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `planck` is:: + + planck.pmf(k) = (1-exp(-lambda_))*exp(-lambda_*k) + + for ``k*lambda_ >= 0``. + + `planck` takes ``lambda_`` as shape parameter. 
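As a small sanity check (illustrative; it assumes ``planck`` is importable
from ``wafo.stats``), the pmf at ``k = 0`` reduces to ``1 - exp(-lambda_)``:

>>> import numpy as np
>>> from wafo.stats import planck
>>> np.allclose(planck.pmf(0, 0.5), 1 - np.exp(-0.5))
True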
+ + %(example)s + + """ + def _argcheck(self, lambda_): + if (lambda_ > 0): + self.a = 0 + self.b = np.inf + return 1 + elif (lambda_ < 0): + self.a = -np.inf + self.b = 0 + return 1 + else: + return 0 + + def _pmf(self, k, lambda_): + fact = -expm1(-lambda_) + return fact * exp(-lambda_ * k) + + def _cdf(self, x, lambda_): + k = floor(x) + return - expm1(-lambda_ * (k + 1)) + + def _ppf(self, q, lambda_): + vals = ceil(-1.0/lambda_ * log1p(-q)-1) + vals1 = (vals-1).clip(self.a, np.inf) + temp = self._cdf(vals1, lambda_) + return np.where(temp >= q, vals1, vals) + + def _stats(self, lambda_): + mu = 1/(exp(lambda_)-1) + var = exp(-lambda_)/(expm1(-lambda_))**2 + g1 = 2*cosh(lambda_/2.0) + g2 = 4+2*cosh(lambda_) + return mu, var, g1, g2 + + def _entropy(self, lambda_): + l = lambda_ + C = -expm1(-l) + return l * exp(-l) / C - log(C) +planck = planck_gen(name='planck', longname='A discrete exponential ') + + +class boltzmann_gen(rv_discrete): + """A Boltzmann (Truncated Discrete Exponential) random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `boltzmann` is:: + + boltzmann.pmf(k) = (1-exp(-lambda_)*exp(-lambda_*k)/(1-exp(-lambda_*N)) + + for ``k = 0,..., N-1``. + + `boltzmann` takes ``lambda_`` and ``N`` as shape parameters. + + %(example)s + + """ + def _pmf(self, k, lambda_, N): + fact = (expm1(-lambda_))/(expm1(-lambda_*N)) + return fact*exp(-lambda_*k) + + def _cdf(self, x, lambda_, N): + k = floor(x) + return (expm1(-lambda_*(k+1)))/(expm1(-lambda_*N)) + + def _ppf(self, q, lambda_, N): + qnew = -q*(expm1(-lambda_*N)) + vals = ceil(-1.0/lambda_ * log1p(-qnew)-1) + vals1 = (vals-1).clip(0.0, np.inf) + temp = self._cdf(vals1, lambda_, N) + return np.where(temp >= q, vals1, vals) + + def _stats(self, lambda_, N): + z = exp(-lambda_) + zN = exp(-lambda_*N) + mu = z/(1.0-z)-N*zN/(1-zN) + var = z/(1.0-z)**2 - N*N*zN/(1-zN)**2 + trm = (1-zN)/(1-z) + trm2 = (z*trm**2 - N*N*zN) + g1 = z*(1+z)*trm**3 - N**3*zN*(1+zN) + g1 = g1 / trm2**(1.5) + g2 = z*(1+4*z+z*z)*trm**4 - N**4 * zN*(1+4*zN+zN*zN) + g2 = g2 / trm2 / trm2 + return mu, var, g1, g2 +boltzmann = boltzmann_gen(name='boltzmann', + longname='A truncated discrete exponential ') + + +class randint_gen(rv_discrete): + """A uniform discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `randint` is:: + + randint.pmf(k) = 1./(high - low) + + for ``k = low, ..., high - 1``. + + `randint` takes ``low`` and ``high`` as shape parameters. + + Note the difference to the numpy ``random_integers`` which + returns integers on a *closed* interval ``[low, high]``. + + %(example)s + + """ + def _argcheck(self, low, high): + self.a = low + self.b = high - 1 + return (high > low) + + def _pmf(self, k, low, high): + p = np.ones_like(k) / (high - low) + return np.where((k >= low) & (k < high), p, 0.) + + def _cdf(self, x, low, high): + k = floor(x) + return (k - low + 1.) / (high - low) + + def _ppf(self, q, low, high): + vals = ceil(q * (high - low) + low) - 1 + vals1 = (vals - 1).clip(low, high) + temp = self._cdf(vals1, low, high) + return np.where(temp >= q, vals1, vals) + + def _stats(self, low, high): + m2, m1 = np.asarray(high), np.asarray(low) + mu = (m2 + m1 - 1.0) / 2 + d = m2 - m1 + var = (d*d - 1) / 12.0 + g1 = 0.0 + g2 = -6.0/5.0 * (d*d + 1.0) / (d*d - 1.0) + return mu, var, g1, g2 + + def _rvs(self, low, high=None): + """An array of *size* random integers >= ``low`` and < ``high``. 
+ + If ``high`` is ``None``, then range is >=0 and < low + """ + return mtrand.randint(low, high, self._size) + + def _entropy(self, low, high): + return log(high - low) +randint = randint_gen(name='randint', longname='A discrete uniform ' + '(random integer)') + + +# FIXME: problems sampling. +class zipf_gen(rv_discrete): + """A Zipf discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `zipf` is:: + + zipf.pmf(k, a) = 1/(zeta(a) * k**a) + + for ``k >= 1``. + + `zipf` takes ``a`` as shape parameter. + + %(example)s + + """ + def _rvs(self, a): + return mtrand.zipf(a, size=self._size) + + def _argcheck(self, a): + return a > 1 + + def _pmf(self, k, a): + Pk = 1.0 / special.zeta(a, 1) / k**a + return Pk + + def _munp(self, n, a): + return _lazywhere( + a > n + 1, (a, n), + lambda a, n: special.zeta(a - n, 1) / special.zeta(a, 1), + np.inf) +zipf = zipf_gen(a=1, name='zipf', longname='A Zipf') + + +class dlaplace_gen(rv_discrete): + """A Laplacian discrete random variable. + + %(before_notes)s + + Notes + ----- + The probability mass function for `dlaplace` is:: + + dlaplace.pmf(k) = tanh(a/2) * exp(-a*abs(k)) + + for ``a > 0``. + + `dlaplace` takes ``a`` as shape parameter. + + %(example)s + + """ + def _pmf(self, k, a): + return tanh(a/2.0) * exp(-a * abs(k)) + + def _cdf(self, x, a): + k = floor(x) + f = lambda k, a: 1.0 - exp(-a * k) / (exp(a) + 1) + f2 = lambda k, a: exp(a * (k+1)) / (exp(a) + 1) + return _lazywhere(k >= 0, (k, a), f=f, f2=f2) + + def _ppf(self, q, a): + const = 1 + exp(a) + vals = ceil(np.where(q < 1.0 / (1 + exp(-a)), log(q*const) / a - 1, + -log((1-q) * const) / a)) + vals1 = vals - 1 + return np.where(self._cdf(vals1, a) >= q, vals1, vals) + + def _stats(self, a): + ea = exp(a) + mu2 = 2.*ea/(ea-1.)**2 + mu4 = 2.*ea*(ea**2+10.*ea+1.) / (ea-1.)**4 + return 0., mu2, 0., mu4/mu2**2 - 3. + + def _entropy(self, a): + return a / sinh(a) - log(tanh(a/2.0)) +dlaplace = dlaplace_gen(a=-np.inf, + name='dlaplace', longname='A discrete Laplacian') + + +class skellam_gen(rv_discrete): + """A Skellam discrete random variable. + + %(before_notes)s + + Notes + ----- + Probability distribution of the difference of two correlated or + uncorrelated Poisson random variables. + + Let k1 and k2 be two Poisson-distributed r.v. with expected values + lam1 and lam2. Then, ``k1 - k2`` follows a Skellam distribution with + parameters ``mu1 = lam1 - rho*sqrt(lam1*lam2)`` and + ``mu2 = lam2 - rho*sqrt(lam1*lam2)``, where rho is the correlation + coefficient between k1 and k2. If the two Poisson-distributed r.v. + are independent then ``rho = 0``. + + Parameters mu1 and mu2 must be strictly positive. + + For details see: http://en.wikipedia.org/wiki/Skellam_distribution + + `skellam` takes ``mu1`` and ``mu2`` as shape parameters. 
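For example, the first two moments follow directly from the two rates (a
sketch; it assumes ``skellam`` is exposed from ``wafo.stats``):

>>> from wafo.stats import skellam
>>> float(skellam.mean(3, 1)), float(skellam.var(3, 1))
(2.0, 4.0)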
+ + %(example)s + + """ + def _rvs(self, mu1, mu2): + n = self._size + return mtrand.poisson(mu1, n) - mtrand.poisson(mu2, n) + + def _pmf(self, x, mu1, mu2): + px = np.where(x < 0, + _ncx2_pdf(2*mu2, 2*(1-x), 2*mu1)*2, + _ncx2_pdf(2*mu1, 2*(1+x), 2*mu2)*2) + # ncx2.pdf() returns nan's for extremely low probabilities + return px + + def _cdf(self, x, mu1, mu2): + x = floor(x) + px = np.where(x < 0, + _ncx2_cdf(2*mu2, -2*x, 2*mu1), + 1-_ncx2_cdf(2*mu1, 2*(x+1), 2*mu2)) + return px + + def _stats(self, mu1, mu2): + mean = mu1 - mu2 + var = mu1 + mu2 + g1 = mean / sqrt((var)**3) + g2 = 1 / var + return mean, var, g1, g2 +skellam = skellam_gen(a=-np.inf, name="skellam", longname='A Skellam') diff --git a/pywafo/src/wafo/stats/_distn_infrastructure.py b/pywafo/src/wafo/stats/_distn_infrastructure.py new file mode 100644 index 0000000..aad95b4 --- /dev/null +++ b/pywafo/src/wafo/stats/_distn_infrastructure.py @@ -0,0 +1,3530 @@ +# +# Author: Travis Oliphant 2002-2011 with contributions from +# SciPy Developers 2004-2011 +# +from __future__ import division, print_function, absolute_import + +from scipy.lib.six import string_types, exec_ + +import sys +import keyword +import re +import inspect +import types +import warnings +from scipy.misc import doccer + +from scipy.special import xlogy, chndtr, gammaln, hyp0f1 + +# for root finding for discrete distribution ppf, and max likelihood estimation +from scipy import optimize + +# for functions of continuous distributions (e.g. moments, entropy, cdf) +from scipy import integrate + +# to approximate the pdf of a continuous distribution given its cdf +from scipy.misc import comb, derivative + +from numpy import (arange, putmask, ravel, take, ones, sum, shape, + product, reshape, zeros, floor, logical_and, log, sqrt, exp, + ndarray, newaxis) + +from numpy import (place, any, argsort, argmax, vectorize, + asarray, nan, inf, isinf, NINF, empty) + +import numpy as np +import numpy.random as mtrand + +from ._constants import _EPS, _XMAX +from .estimation import FitDistribution + +try: + from new import instancemethod +except ImportError: + # Python 3 + def instancemethod(func, obj, cls): + return types.MethodType(func, obj) + + +# These are the docstring parts used for substitution in specific +# distribution docstrings + +docheaders = {'methods': """\nMethods\n-------\n""", + 'parameters': """\nParameters\n---------\n""", + 'notes': """\nNotes\n-----\n""", + 'examples': """\nExamples\n--------\n"""} + +_doc_rvs = """\ +rvs(%(shapes)s, loc=0, scale=1, size=1) + Random variates. +""" +_doc_pdf = """\ +pdf(x, %(shapes)s, loc=0, scale=1) + Probability density function. +""" +_doc_logpdf = """\ +logpdf(x, %(shapes)s, loc=0, scale=1) + Log of the probability density function. +""" +_doc_pmf = """\ +pmf(x, %(shapes)s, loc=0, scale=1) + Probability mass function. +""" +_doc_logpmf = """\ +logpmf(x, %(shapes)s, loc=0, scale=1) + Log of the probability mass function. +""" +_doc_cdf = """\ +cdf(x, %(shapes)s, loc=0, scale=1) + Cumulative density function. +""" +_doc_logcdf = """\ +logcdf(x, %(shapes)s, loc=0, scale=1) + Log of the cumulative density function. +""" +_doc_sf = """\ +sf(x, %(shapes)s, loc=0, scale=1) + Survival function (1-cdf --- sometimes more accurate). +""" +_doc_logsf = """\ +logsf(x, %(shapes)s, loc=0, scale=1) + Log of the survival function. +""" +_doc_ppf = """\ +ppf(q, %(shapes)s, loc=0, scale=1) + Percent point function (inverse of cdf --- percentiles). 
+""" +_doc_isf = """\ +isf(q, %(shapes)s, loc=0, scale=1) + Inverse survival function (inverse of sf). +""" +_doc_moment = """\ +moment(n, %(shapes)s, loc=0, scale=1) + Non-central moment of order n +""" +_doc_stats = """\ +stats(%(shapes)s, loc=0, scale=1, moments='mv') + Mean('m'), variance('v'), skew('s'), and/or kurtosis('k'). +""" +_doc_entropy = """\ +entropy(%(shapes)s, loc=0, scale=1) + (Differential) entropy of the RV. +""" +_doc_fit = """\ +fit(data, %(shapes)s, loc=0, scale=1) + Parameter estimates for generic data. +""" +_doc_expect = """\ +expect(func, %(shapes)s, loc=0, scale=1, lb=None, ub=None, conditional=False, **kwds) + Expected value of a function (of one argument) with respect to the distribution. +""" +_doc_expect_discrete = """\ +expect(func, %(shapes)s, loc=0, lb=None, ub=None, conditional=False) + Expected value of a function (of one argument) with respect to the distribution. +""" +_doc_median = """\ +median(%(shapes)s, loc=0, scale=1) + Median of the distribution. +""" +_doc_mean = """\ +mean(%(shapes)s, loc=0, scale=1) + Mean of the distribution. +""" +_doc_var = """\ +var(%(shapes)s, loc=0, scale=1) + Variance of the distribution. +""" +_doc_std = """\ +std(%(shapes)s, loc=0, scale=1) + Standard deviation of the distribution. +""" +_doc_interval = """\ +interval(alpha, %(shapes)s, loc=0, scale=1) + Endpoints of the range that contains alpha percent of the distribution +""" +_doc_allmethods = ''.join([docheaders['methods'], _doc_rvs, _doc_pdf, + _doc_logpdf, _doc_cdf, _doc_logcdf, _doc_sf, + _doc_logsf, _doc_ppf, _doc_isf, _doc_moment, + _doc_stats, _doc_entropy, _doc_fit, + _doc_expect, _doc_median, + _doc_mean, _doc_var, _doc_std, _doc_interval]) + +# Note that the two lines for %(shapes) are searched for and replaced in +# rv_continuous and rv_discrete - update there if the exact string changes +_doc_default_callparams = """\ +Parameters +---------- +x : array_like + quantiles +q : array_like + lower or upper tail probability +%(shapes)s : array_like + shape parameters +loc : array_like, optional + location parameter (default=0) +scale : array_like, optional + scale parameter (default=1) +size : int or tuple of ints, optional + shape of random variates (default computed from input arguments ) +moments : str, optional + composed of letters ['mvsk'] specifying which moments to compute where + 'm' = mean, 'v' = variance, 's' = (Fisher's) skew and + 'k' = (Fisher's) kurtosis. (default='mv') +""" +_doc_default_longsummary = """\ +Continuous random variables are defined from a standard form and may +require some shape parameters to complete its specification. Any +optional keyword parameters can be passed to the methods of the RV +object as given below: +""" +_doc_default_frozen_note = """ +Alternatively, the object may be called (as a function) to fix the shape, +location, and scale parameters returning a "frozen" continuous RV object: + +rv = %(name)s(%(shapes)s, loc=0, scale=1) + - Frozen RV object with the same methods but holding the given shape, + location, and scale fixed. +""" +_doc_default_example = """\ +Examples +-------- +>>> import matplotlib.pyplot as plt +>>> from wafo.stats import %(name)s +>>> numargs = %(name)s.numargs +>>> [ %(shapes)s ] = [0.9,] * numargs +>>> rv = %(name)s(%(shapes)s) + +Display frozen pdf + +>>> x = np.linspace(0, np.minimum(rv.dist.b, 3)) +>>> h = plt.plot(x, rv.pdf(x)) + +Here, ``rv.dist.b`` is the right endpoint of the support of ``rv.dist``. 
+ +Check accuracy of cdf and ppf + +>>> prb = %(name)s.cdf(x, %(shapes)s) +>>> h = plt.semilogy(np.abs(x - %(name)s.ppf(prb, %(shapes)s)) + 1e-20) + +Random number generation + +>>> R = %(name)s.rvs(%(shapes)s, size=100) + +Compare ML and MPS method +>>> phat = %(name)s.fit2(R, method='ml'); +>>> phat.plotfitsummary(); plt.figure(plt.gcf().number+1) +>>> phat2 = %(name)s.fit2(R, method='mps') +>>> phat2.plotfitsummary(); plt.figure(plt.gcf().number+1) + +Fix loc=0 and estimate shapes and scale +>>> phat3 = %(name)s.fit2(R, scale=1, floc=0, method='mps') +>>> phat3.plotfitsummary(); plt.figure(plt.gcf().number+1) + +Accurate confidence interval with profile loglikelihood +>>> lp = phat3.profile() +>>> lp.plot() +>>> pci = lp.get_bounds() + +""" + +_doc_default = ''.join([_doc_default_longsummary, + _doc_allmethods, + _doc_default_callparams, + _doc_default_frozen_note, + _doc_default_example]) + +_doc_default_before_notes = ''.join([_doc_default_longsummary, + _doc_allmethods, + _doc_default_callparams, + _doc_default_frozen_note]) + +docdict = { + 'rvs': _doc_rvs, + 'pdf': _doc_pdf, + 'logpdf': _doc_logpdf, + 'cdf': _doc_cdf, + 'logcdf': _doc_logcdf, + 'sf': _doc_sf, + 'logsf': _doc_logsf, + 'ppf': _doc_ppf, + 'isf': _doc_isf, + 'stats': _doc_stats, + 'entropy': _doc_entropy, + 'fit': _doc_fit, + 'moment': _doc_moment, + 'expect': _doc_expect, + 'interval': _doc_interval, + 'mean': _doc_mean, + 'std': _doc_std, + 'var': _doc_var, + 'median': _doc_median, + 'allmethods': _doc_allmethods, + 'callparams': _doc_default_callparams, + 'longsummary': _doc_default_longsummary, + 'frozennote': _doc_default_frozen_note, + 'example': _doc_default_example, + 'default': _doc_default, + 'before_notes': _doc_default_before_notes +} + +# Reuse common content between continuous and discrete docs, change some +# minor bits. +docdict_discrete = docdict.copy() + +docdict_discrete['pmf'] = _doc_pmf +docdict_discrete['logpmf'] = _doc_logpmf +docdict_discrete['expect'] = _doc_expect_discrete +_doc_disc_methods = ['rvs', 'pmf', 'logpmf', 'cdf', 'logcdf', 'sf', 'logsf', + 'ppf', 'isf', 'stats', 'entropy', 'expect', 'median', + 'mean', 'var', 'std', 'interval', + 'fit'] +for obj in _doc_disc_methods: + docdict_discrete[obj] = docdict_discrete[obj].replace(', scale=1', '') +docdict_discrete.pop('pdf') +docdict_discrete.pop('logpdf') + +_doc_allmethods = ''.join([docdict_discrete[obj] for obj in _doc_disc_methods]) +docdict_discrete['allmethods'] = docheaders['methods'] + _doc_allmethods + +docdict_discrete['longsummary'] = _doc_default_longsummary.replace( + 'Continuous', 'Discrete') +_doc_default_frozen_note = """ +Alternatively, the object may be called (as a function) to fix the shape and +location parameters returning a "frozen" discrete RV object: + +rv = %(name)s(%(shapes)s, loc=0) + - Frozen RV object with the same methods but holding the given shape and + location fixed. +""" +docdict_discrete['frozennote'] = _doc_default_frozen_note + +_doc_default_discrete_example = """\ +Examples +-------- +>>> from scipy.stats import %(name)s +>>> [ %(shapes)s ] = [] +>>> rv = %(name)s(%(shapes)s) + +Display frozen pmf + +>>> x = np.arange(0, np.minimum(rv.dist.b, 3)) +>>> h = plt.vlines(x, 0, rv.pmf(x), lw=2) + +Here, ``rv.dist.b`` is the right endpoint of the support of ``rv.dist``. 
+ +Check accuracy of cdf and ppf + +>>> prb = %(name)s.cdf(x, %(shapes)s) +>>> h = plt.semilogy(np.abs(x - %(name)s.ppf(prb, %(shapes)s)) + 1e-20) + +Random number generation + +>>> R = %(name)s.rvs(%(shapes)s, size=100) + +""" +docdict_discrete['example'] = _doc_default_discrete_example + +_doc_default_before_notes = ''.join([docdict_discrete['longsummary'], + docdict_discrete['allmethods'], + docdict_discrete['callparams'], + docdict_discrete['frozennote']]) +docdict_discrete['before_notes'] = _doc_default_before_notes + +_doc_default_disc = ''.join([docdict_discrete['longsummary'], + docdict_discrete['allmethods'], + docdict_discrete['frozennote'], + docdict_discrete['example']]) +docdict_discrete['default'] = _doc_default_disc + + +# clean up all the separate docstring elements, we do not need them anymore +for obj in [s for s in dir() if s.startswith('_doc_')]: + exec('del ' + obj) +del obj +try: + del s +except NameError: + # in Python 3, loop variables are not visible after the loop + pass + + +def _moment(data, n, mu=None): + if mu is None: + mu = data.mean() + return ((data - mu)**n).mean() + + +def _moment_from_stats(n, mu, mu2, g1, g2, moment_func, args): + if (n == 0): + return 1.0 + elif (n == 1): + if mu is None: + val = moment_func(1, *args) + else: + val = mu + elif (n == 2): + if mu2 is None or mu is None: + val = moment_func(2, *args) + else: + val = mu2 + mu*mu + elif (n == 3): + if g1 is None or mu2 is None or mu is None: + val = moment_func(3, *args) + else: + mu3 = g1 * np.power(mu2, 1.5) # 3rd central moment + val = mu3+3*mu*mu2+mu*mu*mu # 3rd non-central moment + elif (n == 4): + if g1 is None or g2 is None or mu2 is None or mu is None: + val = moment_func(4, *args) + else: + mu4 = (g2+3.0)*(mu2**2.0) # 4th central moment + mu3 = g1*np.power(mu2, 1.5) # 3rd central moment + val = mu4+4*mu*mu3+6*mu*mu*mu2+mu*mu*mu*mu + else: + val = moment_func(n, *args) + + return val + + +def _skew(data): + """ + skew is third central moment / variance**(1.5) + """ + data = np.ravel(data) + mu = data.mean() + m2 = ((data - mu)**2).mean() + m3 = ((data - mu)**3).mean() + return m3 / np.power(m2, 1.5) + + +def _kurtosis(data): + """ + kurtosis is fourth central moment / variance**2 - 3 + """ + data = np.ravel(data) + mu = data.mean() + m2 = ((data - mu)**2).mean() + m4 = ((data - mu)**4).mean() + return m4 / m2**2 - 3 + + +# Frozen RV class +class rv_frozen(object): + ''' Frozen continous or discrete 1D Random Variable object (RV) + + Methods + ------- + RV.rvs(size=1) + - random variates + + RV.pdf(x) + - probability density function (continous case) + + RV.pmf(x) + - probability mass function (discrete case) + + RV.cdf(x) + - cumulative density function + + RV.sf(x) + - survival function (1-cdf --- sometimes more accurate) + + RV.ppf(q) + - percent point function (inverse of cdf --- percentiles) + + RV.isf(q) + - inverse survival function (inverse of sf) + + RV.stats(moments='mv') + - mean('m'), variance('v'), skew('s'), and/or kurtosis('k') + + RV.entropy() + - (differential) entropy of the RV. 
+ + Parameters + ---------- + x : array-like + quantiles + q : array-like + lower or upper tail probability + size : int or tuple of ints, optional, keyword + shape of random variates + moments : string, optional, keyword + one or more of 'm' mean, 'v' variance, 's' skewness, 'k' kurtosis + ''' + def __init__(self, dist, *args, **kwds): + self.dist = dist + args, loc, scale = dist._parse_args(*args, **kwds) + if len(args) == dist.numargs - 2: # isinstance(dist, rv_continuous): + self.par = args + (loc, scale) + else: # rv_discrete + self.par = args + (loc,) + + def pdf(self, x): + ''' Probability density function at x of the given RV.''' + return self.dist.pdf(x, *self.par) + + def logpdf(self, x): + return self.dist.logpdf(x, *self.par) + + def cdf(self, x): + '''Cumulative distribution function at x of the given RV.''' + return self.dist.cdf(x, *self.par) + + def logcdf(self, x): + return self.dist.logcdf(x, *self.par) + + def ppf(self, q): + '''Percent point function (inverse of cdf) at q of the given RV.''' + return self.dist.ppf(q, *self.par) + + def isf(self, q): + '''Inverse survival function at q of the given RV.''' + return self.dist.isf(q, *self.par) + + def rvs(self, size=None): + '''Random variates of given type.''' + kwds = dict(size=size) + return self.dist.rvs(*self.par, **kwds) + + def sf(self, x): + '''Survival function (1-cdf) at x of the given RV.''' + return self.dist.sf(x, *self.par) + + def logsf(self, x): + return self.dist.logsf(x, *self.par) + + def stats(self, moments='mv'): + ''' Some statistics of the given RV''' + kwds = dict(moments=moments) + return self.dist.stats(*self.par, **kwds) + + def median(self): + return self.dist.median(*self.par) + + def mean(self): + return self.dist.mean(*self.par) + + def var(self): + return self.dist.var(*self.par) + + def std(self): + return self.dist.std(*self.par) + + def moment(self, n): + return self.dist.moment(n, *self.par) + + def entropy(self): + return self.dist.entropy(*self.par) + + def pmf(self, k): + '''Probability mass function at k of the given RV''' + return self.dist.pmf(k, *self.par) + + def logpmf(self, k): + return self.dist.logpmf(k, *self.par) + + def interval(self, alpha): + return self.dist.interval(alpha, *self.par) + + +# Frozen RV class +class rv_frozen_old(object): + def __init__(self, dist, *args, **kwds): + self.args = args + self.kwds = kwds + self.dist = dist + + def pdf(self, x): # raises AttributeError in frozen discrete distribution + return self.dist.pdf(x, *self.args, **self.kwds) + + def logpdf(self, x): + return self.dist.logpdf(x, *self.args, **self.kwds) + + def cdf(self, x): + return self.dist.cdf(x, *self.args, **self.kwds) + + def logcdf(self, x): + return self.dist.logcdf(x, *self.args, **self.kwds) + + def ppf(self, q): + return self.dist.ppf(q, *self.args, **self.kwds) + + def isf(self, q): + return self.dist.isf(q, *self.args, **self.kwds) + + def rvs(self, size=None): + kwds = self.kwds.copy() + kwds.update({'size': size}) + return self.dist.rvs(*self.args, **kwds) + + def sf(self, x): + return self.dist.sf(x, *self.args, **self.kwds) + + def logsf(self, x): + return self.dist.logsf(x, *self.args, **self.kwds) + + def stats(self, moments='mv'): + kwds = self.kwds.copy() + kwds.update({'moments': moments}) + return self.dist.stats(*self.args, **kwds) + + def median(self): + return self.dist.median(*self.args, **self.kwds) + + def mean(self): + return self.dist.mean(*self.args, **self.kwds) + + def var(self): + return self.dist.var(*self.args, **self.kwds) + + def std(self): + 
return self.dist.std(*self.args, **self.kwds) + + def moment(self, n): + return self.dist.moment(n, *self.args, **self.kwds) + + def entropy(self): + return self.dist.entropy(*self.args, **self.kwds) + + def pmf(self, k): + return self.dist.pmf(k, *self.args, **self.kwds) + + def logpmf(self, k): + return self.dist.logpmf(k, *self.args, **self.kwds) + + def interval(self, alpha): + return self.dist.interval(alpha, *self.args, **self.kwds) + + +def valarray(shape, value=nan, typecode=None): + """Return an array of all value. + """ + + out = ones(shape, dtype=bool) * value + if typecode is not None: + out = out.astype(typecode) + if not isinstance(out, ndarray): + out = asarray(out) + return out + + +def _lazywhere(cond, arrays, f, fillvalue=None, f2=None): + """ + np.where(cond, x, fillvalue) always evaluates x even where cond is False. + This one only evaluates f(arr1[cond], arr2[cond], ...). + For example, + >>> a, b = np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]) + >>> def f(a, b): + return a*b + >>> _lazywhere(a > 2, (a, b), f, np.nan) + array([ nan, nan, 21., 32.]) + + Notice it assumes that all `arrays` are of the same shape, or can be + broadcasted together. + + """ + if fillvalue is None: + if f2 is None: + raise ValueError("One of (fillvalue, f2) must be given.") + else: + fillvalue = np.nan + else: + if f2 is not None: + raise ValueError("Only one of (fillvalue, f2) can be given.") + + arrays = np.broadcast_arrays(*arrays) + temp = tuple(np.extract(cond, arr) for arr in arrays) + out = valarray(shape(arrays[0]), value=fillvalue) + np.place(out, cond, f(*temp)) + if f2 is not None: + temp = tuple(np.extract(~cond, arr) for arr in arrays) + np.place(out, ~cond, f2(*temp)) + + return out + + +# This should be rewritten +def argsreduce(cond, *args): + """Return the sequence of ravel(args[i]) where ravel(condition) is + True in 1D. + + Examples + -------- + >>> import numpy as np + >>> rand = np.random.random_sample + >>> A = rand((4, 5)) + >>> B = 2 + >>> C = rand((1, 5)) + >>> cond = np.ones(A.shape) + >>> [A1, B1, C1] = argsreduce(cond, A, B, C) + >>> B1.shape + (20,) + >>> cond[2,:] = 0 + >>> [A2, B2, C2] = argsreduce(cond, A, B, C) + >>> B2.shape + (15,) + + """ + newargs = np.atleast_1d(*args) + if not isinstance(newargs, list): + newargs = [newargs, ] + expand_arr = (cond == cond) + return [np.extract(cond, arr1 * expand_arr) for arr1 in newargs] + + +parse_arg_template = """ +def _parse_args(self, %(shape_arg_str)s %(locscale_in)s): + return (%(shape_arg_str)s), %(locscale_out)s + +def _parse_args_rvs(self, %(shape_arg_str)s %(locscale_in)s, size=None): + return (%(shape_arg_str)s), %(locscale_out)s, size + +def _parse_args_stats(self, %(shape_arg_str)s %(locscale_in)s, moments='mv'): + return (%(shape_arg_str)s), %(locscale_out)s, moments +""" + + +# Both the continuous and discrete distributions depend on ncx2. +# I think the function name ncx2 is an abbreviation for noncentral chi squared. + +def _ncx2_log_pdf(x, df, nc): + a = asarray(df/2.0) + fac = -nc/2.0 - x/2.0 + (a-1)*log(x) - a*log(2) - gammaln(a) + return fac + np.nan_to_num(log(hyp0f1(a, nc * x/4.0))) + +def _ncx2_pdf(x, df, nc): + return np.exp(_ncx2_log_pdf(x, df, nc)) + +def _ncx2_cdf(x, df, nc): + return chndtr(x, df, nc) + + +class rv_generic(object): + """Class which encapsulates common functionality between rv_discrete + and rv_continuous. 
+ + """ + def __init__(self): + super(rv_generic, self).__init__() + + # figure out if _stats signature has 'moments' keyword + sign = inspect.getargspec(self._stats) + self._stats_has_moments = ((sign[2] is not None) or + ('moments' in sign[0])) + + def _construct_argparser(self, meths_to_inspect, locscale_in, locscale_out): + """Construct the parser for the shape arguments. + + Generates the argument-parsing functions dynamically and attaches + them to the instance. + Is supposed to be called in __init__ of a class for each distribution. + + If self.shapes is a non-empty string, interprets it as a + comma-separated list of shape parameters. + + Otherwise inspects the call signatures of `meths_to_inspect` + and constructs the argument-parsing functions from these. + In this case also sets `shapes` and `numargs`. + """ + + if self.shapes: + # sanitize the user-supplied shapes + if not isinstance(self.shapes, string_types): + raise TypeError('shapes must be a string.') + + shapes = self.shapes.replace(',', ' ').split() + + for field in shapes: + if keyword.iskeyword(field): + raise SyntaxError('keywords cannot be used as shapes.') + if not re.match('^[_a-zA-Z][_a-zA-Z0-9]*$', field): + raise SyntaxError( + 'shapes must be valid python identifiers') + else: + # find out the call signatures (_pdf, _cdf etc), deduce shape + # arguments + shapes_list = [] + for meth in meths_to_inspect: + shapes_args = inspect.getargspec(meth) + shapes_list.append(shapes_args.args) + + # *args or **kwargs are not allowed w/automatic shapes + # (generic methods have 'self, x' only) + if len(shapes_args.args) > 2: + if shapes_args.varargs is not None: + raise TypeError( + '*args are not allowed w/out explicit shapes') + if shapes_args.keywords is not None: + raise TypeError( + '**kwds are not allowed w/out explicit shapes') + if shapes_args.defaults is not None: + raise TypeError('defaults are not allowed for shapes') + + shapes = max(shapes_list, key=lambda x: len(x)) + shapes = shapes[2:] # remove self, x, + + # make sure the signatures are consistent + # (generic methods have 'self, x' only) + for item in shapes_list: + if len(item) > 2 and item[2:] != shapes: + raise TypeError('Shape arguments are inconsistent.') + + # have the arguments, construct the method from template + shapes_str = ', '.join(shapes) + ', ' if shapes else '' # NB: not None + dct = dict(shape_arg_str=shapes_str, + locscale_in=locscale_in, + locscale_out=locscale_out, + ) + ns = {} + exec_(parse_arg_template % dct, ns) + # NB: attach to the instance, not class + for name in ['_parse_args', '_parse_args_stats', '_parse_args_rvs']: + setattr(self, name, + instancemethod(ns[name], self, self.__class__) + ) + + self.shapes = ', '.join(shapes) if shapes else None + if not hasattr(self, 'numargs'): + # allows more general subclassing with *args + self.numargs = len(shapes) + + def freeze(self, *args, **kwds): + """Freeze the distribution for the given arguments. + + Parameters + ---------- + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution. Should include all + the non-optional arguments, may include ``loc`` and ``scale``. + + Returns + ------- + rv_frozen : rv_frozen instance + The frozen distribution. + + """ + return rv_frozen(self, *args, **kwds) + + def __call__(self, *args, **kwds): + return self.freeze(*args, **kwds) + + # The actual calculation functions (no basic checking need be done) + # If these are defined, the others won't be looked at. + # Otherwise, the other set can be defined. 
+ def _stats(self, *args, **kwds): + return None, None, None, None + + # Central moments + def _munp(self, n, *args): + # Silence floating point warnings from integration. + olderr = np.seterr(all='ignore') + vals = self.generic_moment(n, *args) + np.seterr(**olderr) + return vals + + ## These are the methods you must define (standard form functions) + ## NB: generic _pdf, _logpdf, _cdf are different for + ## rv_continuous and rv_discrete hence are defined in there + def _argcheck(self, *args): + """Default check for correct values on args and keywords. + + Returns condition array of 1's where arguments are correct and + 0's where they are not. + + """ + cond = 1 + for arg in args: + cond = logical_and(cond, (asarray(arg) > 0)) + return cond + + ##(return 1-d using self._size to get number) + def _rvs(self, *args): + ## Use basic inverse cdf algorithm for RV generation as default. + U = mtrand.sample(self._size) + Y = self._ppf(U, *args) + return Y + + def _logcdf(self, x, *args): + return log(self._cdf(x, *args)) + + def _sf(self, x, *args): + return 1.0-self._cdf(x, *args) + + def _logsf(self, x, *args): + return log(self._sf(x, *args)) + + def _ppf(self, q, *args): + return self._ppfvec(q, *args) + + def _isf(self, q, *args): + return self._ppf(1.0-q, *args) # use correct _ppf for subclasses + + # These are actually called, and should not be overwritten if you + # want to keep error checking. + def rvs(self, *args, **kwds): + """ + Random variates of given type. + + Parameters + ---------- + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + loc : array_like, optional + Location parameter (default=0). + scale : array_like, optional + Scale parameter (default=1). + size : int or tuple of ints, optional + Defining number of random variates (default=1). + + Returns + ------- + rvs : ndarray or scalar + Random variates of given `size`. + + """ + discrete = kwds.pop('discrete', None) + args, loc, scale, size = self._parse_args_rvs(*args, **kwds) + cond = logical_and(self._argcheck(*args), (scale >= 0)) + if not np.all(cond): + raise ValueError("Domain error in arguments.") + + # self._size is total size of all output values + self._size = product(size, axis=0) + if self._size is not None and self._size > 1: + size = np.array(size, ndmin=1) + + if np.all(scale == 0): + return loc*ones(size, 'd') + + vals = self._rvs(*args) + if self._size is not None: + vals = reshape(vals, size) + + vals = vals * scale + loc + + # Cast to int if discrete + if discrete: + if np.isscalar(vals): + vals = int(vals) + else: + vals = vals.astype(int) + + return vals + + def stats(self, *args, **kwds): + """ + Some statistics of the given RV + + Parameters + ---------- + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + location parameter (default=0) + scale : array_like, optional (discrete RVs only) + scale parameter (default=1) + moments : str, optional + composed of letters ['mvsk'] defining which moments to compute: + 'm' = mean, + 'v' = variance, + 's' = (Fisher's) skew, + 'k' = (Fisher's) kurtosis. + (default='mv') + + Returns + ------- + stats : sequence + of requested moments. 
+ + """ + args, loc, scale, moments = self._parse_args_stats(*args, **kwds) + # scale = 1 by construction for discrete RVs + loc, scale = map(asarray, (loc, scale)) + args = tuple(map(asarray, args)) + cond = self._argcheck(*args) & (scale > 0) & (loc == loc) + output = [] + default = valarray(shape(cond), self.badvalue) + + # Use only entries that are valid in calculation + if any(cond): + goodargs = argsreduce(cond, *(args+(scale, loc))) + scale, loc, goodargs = goodargs[-2], goodargs[-1], goodargs[:-2] + + if self._stats_has_moments: + mu, mu2, g1, g2 = self._stats(*goodargs, + **{'moments': moments}) + else: + mu, mu2, g1, g2 = self._stats(*goodargs) + if g1 is None: + mu3 = None + else: + if mu2 is None: + mu2 = self._munp(2, *goodargs) + # (mu2**1.5) breaks down for nan and inf + mu3 = g1 * np.power(mu2, 1.5) + + if 'm' in moments: + if mu is None: + mu = self._munp(1, *goodargs) + out0 = default.copy() + place(out0, cond, mu * scale + loc) + output.append(out0) + + if 'v' in moments: + if mu2 is None: + mu2p = self._munp(2, *goodargs) + if mu is None: + mu = self._munp(1, *goodargs) + mu2 = mu2p - mu * mu + if np.isinf(mu): + #if mean is inf then var is also inf + mu2 = np.inf + out0 = default.copy() + place(out0, cond, mu2 * scale * scale) + output.append(out0) + + if 's' in moments: + if g1 is None: + mu3p = self._munp(3, *goodargs) + if mu is None: + mu = self._munp(1, *goodargs) + if mu2 is None: + mu2p = self._munp(2, *goodargs) + mu2 = mu2p - mu * mu + mu3 = mu3p - 3 * mu * mu2 - mu**3 + g1 = mu3 / np.power(mu2, 1.5) + out0 = default.copy() + place(out0, cond, g1) + output.append(out0) + + if 'k' in moments: + if g2 is None: + mu4p = self._munp(4, *goodargs) + if mu is None: + mu = self._munp(1, *goodargs) + if mu2 is None: + mu2p = self._munp(2, *goodargs) + mu2 = mu2p - mu * mu + if mu3 is None: + mu3p = self._munp(3, *goodargs) + mu3 = mu3p - 3 * mu * mu2 - mu**3 + mu4 = mu4p - 4 * mu * mu3 - 6 * mu * mu * mu2 - mu**4 + g2 = mu4 / mu2**2.0 - 3.0 + out0 = default.copy() + place(out0, cond, g2) + output.append(out0) + else: # no valid args + output = [] + for _ in moments: + out0 = default.copy() + output.append(out0) + + if len(output) == 1: + return output[0] + else: + return tuple(output) + + def entropy(self, *args, **kwds): + """ + Differential entropy of the RV. + + Parameters + ---------- + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + loc : array_like, optional + Location parameter (default=0). + scale : array_like, optional (continuous distributions only). + Scale parameter (default=1). + + Notes + ----- + Entropy is defined base `e`: + + >>> drv = rv_discrete(values=((0, 1), (0.5, 0.5))) + >>> np.allclose(drv.entropy(), np.log(2.0)) + True + + """ + args, loc, scale = self._parse_args(*args, **kwds) + # NB: for discrete distributions scale=1 by construction in _parse_args + args = tuple(map(asarray, args)) + cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc) + output = zeros(shape(cond0), 'd') + place(output, (1-cond0), self.badvalue) + goodargs = argsreduce(cond0, *args) + # I don't know when or why vecentropy got broken when numargs == 0 + # 09.08.2013: is this still relevant? 
cf check_vecentropy test + # in tests/test_continuous_basic.py + if self.numargs == 0: + place(output, cond0, self._entropy() + log(scale)) + else: + place(output, cond0, self.vecentropy(*goodargs) + log(scale)) + return output + + def moment(self, n, *args, **kwds): + """ + n'th order non-central moment of distribution. + + Parameters + ---------- + n : int, n>=1 + Order of moment. + arg1, arg2, arg3,... : float + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + kwds : keyword arguments, optional + These can include "loc" and "scale", as well as other keyword + arguments relevant for a given distribution. + + """ + args, loc, scale = self._parse_args(*args, **kwds) + if not (self._argcheck(*args) and (scale > 0)): + return nan + if (floor(n) != n): + raise ValueError("Moment must be an integer.") + if (n < 0): + raise ValueError("Moment must be positive.") + mu, mu2, g1, g2 = None, None, None, None + if (n > 0) and (n < 5): + if self._stats_has_moments: + mdict = {'moments': {1: 'm', 2: 'v', 3: 'vs', 4: 'vk'}[n]} + else: + mdict = {} + mu, mu2, g1, g2 = self._stats(*args, **mdict) + val = _moment_from_stats(n, mu, mu2, g1, g2, self._munp, args) + + # Convert to transformed X = L + S*Y + # E[X^n] = E[(L+S*Y)^n] = L^n sum(comb(n, k)*(S/L)^k E[Y^k], k=0...n) + if loc == 0: + return scale**n * val + else: + result = 0 + fac = float(scale) / float(loc) + for k in range(n): + valk = _moment_from_stats(k, mu, mu2, g1, g2, self._munp, args) + result += comb(n, k, exact=True)*(fac**k) * valk + result += fac**n * val + return result * loc**n + + def median(self, *args, **kwds): + """ + Median of the distribution. + + Parameters + ---------- + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + Location parameter, Default is 0. + scale : array_like, optional + Scale parameter, Default is 1. + + Returns + ------- + median : float + The median of the distribution. + + See Also + -------- + stats.distributions.rv_discrete.ppf + Inverse of the CDF + + """ + return self.ppf(0.5, *args, **kwds) + + def mean(self, *args, **kwds): + """ + Mean of the distribution + + Parameters + ---------- + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + location parameter (default=0) + scale : array_like, optional + scale parameter (default=1) + + Returns + ------- + mean : float + the mean of the distribution + """ + kwds['moments'] = 'm' + res = self.stats(*args, **kwds) + if isinstance(res, ndarray) and res.ndim == 0: + return res[()] + return res + + def var(self, *args, **kwds): + """ + Variance of the distribution + + Parameters + ---------- + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + location parameter (default=0) + scale : array_like, optional + scale parameter (default=1) + + Returns + ------- + var : float + the variance of the distribution + + """ + kwds['moments'] = 'v' + res = self.stats(*args, **kwds) + if isinstance(res, ndarray) and res.ndim == 0: + return res[()] + return res + + def std(self, *args, **kwds): + """ + Standard deviation of the distribution. + + Parameters + ---------- + arg1, arg2, arg3,... 
: array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + location parameter (default=0) + scale : array_like, optional + scale parameter (default=1) + + Returns + ------- + std : float + standard deviation of the distribution + + """ + kwds['moments'] = 'v' + res = sqrt(self.stats(*args, **kwds)) + return res + + def interval(self, alpha, *args, **kwds): + """ + Confidence interval with equal areas around the median. + + Parameters + ---------- + alpha : array_like of float + Probability that an rv will be drawn from the returned range. + Each value should be in the range [0, 1]. + arg1, arg2, ... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + loc : array_like, optional + location parameter, Default is 0. + scale : array_like, optional + scale parameter, Default is 1. + + Returns + ------- + a, b : ndarray of float + end-points of range that contain ``100 * alpha %`` of the rv's + possible values. + + """ + alpha = asarray(alpha) + if any((alpha > 1) | (alpha < 0)): + raise ValueError("alpha must be between 0 and 1 inclusive") + q1 = (1.0-alpha)/2 + q2 = (1.0+alpha)/2 + a = self.ppf(q1, *args, **kwds) + b = self.ppf(q2, *args, **kwds) + return a, b + + +## continuous random variables: implement maybe later +## +## hf --- Hazard Function (PDF / SF) +## chf --- Cumulative hazard function (-log(SF)) +## psf --- Probability sparsity function (reciprocal of the pdf) in +## units of percent-point-function (as a function of q). +## Also, the derivative of the percent-point function. + +class rv_continuous(rv_generic): + """ + A generic continuous random variable class meant for subclassing. + + `rv_continuous` is a base class to construct specific distribution classes + and instances from for continuous random variables. It cannot be used + directly as a distribution. + + Parameters + ---------- + momtype : int, optional + The type of generic moment calculation to use: 0 for pdf, 1 (default) + for ppf. + a : float, optional + Lower bound of the support of the distribution, default is minus + infinity. + b : float, optional + Upper bound of the support of the distribution, default is plus + infinity. + xtol : float, optional + The tolerance for fixed point calculation for generic ppf. + badvalue : object, optional + The value in a result arrays that indicates a value that for which + some argument restriction is violated, default is np.nan. + name : str, optional + The name of the instance. This string is used to construct the default + example for distributions. + longname : str, optional + This string is used as part of the first line of the docstring returned + when a subclass has no docstring of its own. Note: `longname` exists + for backwards compatibility, do not use for new subclasses. + shapes : str, optional + The shape of the distribution. For example ``"m, n"`` for a + distribution that takes two integers as the two shape arguments for all + its methods. + extradoc : str, optional, deprecated + This string is used as the last part of the docstring returned when a + subclass has no docstring of its own. Note: `extradoc` exists for + backwards compatibility, do not use for new subclasses. 
+ + Methods + ------- + rvs(, loc=0, scale=1, size=1) + random variates + + pdf(x, , loc=0, scale=1) + probability density function + + logpdf(x, , loc=0, scale=1) + log of the probability density function + + cdf(x, , loc=0, scale=1) + cumulative density function + + logcdf(x, , loc=0, scale=1) + log of the cumulative density function + + sf(x, , loc=0, scale=1) + survival function (1-cdf --- sometimes more accurate) + + logsf(x, , loc=0, scale=1) + log of the survival function + + ppf(q, , loc=0, scale=1) + percent point function (inverse of cdf --- quantiles) + + isf(q, , loc=0, scale=1) + inverse survival function (inverse of sf) + + moment(n, , loc=0, scale=1) + non-central n-th moment of the distribution. May not work for array + arguments. + + stats(, loc=0, scale=1, moments='mv') + mean('m'), variance('v'), skew('s'), and/or kurtosis('k') + + entropy(, loc=0, scale=1) + (differential) entropy of the RV. + + fit(data, , loc=0, scale=1) + Parameter estimates for generic data + + expect(func=None, args=(), loc=0, scale=1, lb=None, ub=None, + conditional=False, **kwds) + Expected value of a function with respect to the distribution. + Additional kwd arguments passed to integrate.quad + + median(, loc=0, scale=1) + Median of the distribution. + + mean(, loc=0, scale=1) + Mean of the distribution. + + std(, loc=0, scale=1) + Standard deviation of the distribution. + + var(, loc=0, scale=1) + Variance of the distribution. + + interval(alpha, , loc=0, scale=1) + Interval that with `alpha` percent probability contains a random + realization of this distribution. + + __call__(, loc=0, scale=1) + Calling a distribution instance creates a frozen RV object with the + same methods but holding the given shape, location, and scale fixed. + See Notes section. + + **Parameters for Methods** + + x : array_like + quantiles + q : array_like + lower or upper tail probability + : array_like + shape parameters + loc : array_like, optional + location parameter (default=0) + scale : array_like, optional + scale parameter (default=1) + size : int or tuple of ints, optional + shape of random variates (default computed from input arguments ) + moments : string, optional + composed of letters ['mvsk'] specifying which moments to compute where + 'm' = mean, 'v' = variance, 's' = (Fisher's) skew and + 'k' = (Fisher's) kurtosis. (default='mv') + n : int + order of moment to calculate in method moments + + Notes + ----- + + **Methods that can be overwritten by subclasses** + :: + + _rvs + _pdf + _cdf + _sf + _ppf + _isf + _stats + _munp + _entropy + _argcheck + + There are additional (internal and private) generic methods that can + be useful for cross-checking and for debugging, but might work in all + cases when directly called. + + **Frozen Distribution** + + Alternatively, the object may be called (as a function) to fix the shape, + location, and scale parameters returning a "frozen" continuous RV object: + + rv = generic(, loc=0, scale=1) + frozen RV object with the same methods but holding the given shape, + location, and scale fixed + + **Subclassing** + + New random variables can be defined by subclassing rv_continuous class + and re-defining at least the ``_pdf`` or the ``_cdf`` method (normalized + to location 0 and scale 1) which will be given clean arguments (in between + a and b) and passing the argument check method. + + If positive argument checking is not correct for your RV + then you will also need to re-define the ``_argcheck`` method. 
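+
+    As a purely illustrative sketch (the class and instance names below are
+    made up; the density used is simply the standard Weibull density, already
+    available as `weibull_min`), a minimal subclass with one positive shape
+    parameter ``c`` could look like::
+
+        import numpy as np
+        from scipy.stats import rv_continuous
+
+        class demo_weibull_gen(rv_continuous):
+            "Illustrative subclass with support x >= 0 and shape c > 0."
+            def _pdf(self, x, c):
+                # standardized density (loc=0, scale=1)
+                return c * x**(c - 1.0) * np.exp(-x**c)
+            def _argcheck(self, c):
+                # the shape parameter must be strictly positive
+                return c > 0
+
+        demo_weibull = demo_weibull_gen(a=0.0, name='demo_weibull')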
+ + Correct, but potentially slow defaults exist for the remaining + methods but for speed and/or accuracy you can over-ride:: + + _logpdf, _cdf, _logcdf, _ppf, _rvs, _isf, _sf, _logsf + + Rarely would you override ``_isf``, ``_sf`` or ``_logsf``, but you could. + + Statistics are computed using numerical integration by default. + For speed you can redefine this using ``_stats``: + + - take shape parameters and return mu, mu2, g1, g2 + - If you can't compute one of these, return it as None + - Can also be defined with a keyword argument ``moments=``, + where is a string composed of 'm', 'v', 's', + and/or 'k'. Only the components appearing in string + should be computed and returned in the order 'm', 'v', + 's', or 'k' with missing values returned as None. + + Alternatively, you can override ``_munp``, which takes n and shape + parameters and returns the nth non-central moment of the distribution. + + A note on ``shapes``: subclasses need not specify them explicitly. In this + case, the `shapes` will be automatically deduced from the signatures of the + overridden methods. + If, for some reason, you prefer to avoid relying on introspection, you can + specify ``shapes`` explicitly as an argument to the instance constructor. + + Examples + -------- + To create a new Gaussian distribution, we would do the following:: + + class gaussian_gen(rv_continuous): + "Gaussian distribution" + def _pdf(self, x): + ... + ... + + """ + + def __init__(self, momtype=1, a=None, b=None, xtol=1e-14, + badvalue=None, name=None, longname=None, + shapes=None, extradoc=None): + + super(rv_continuous, self).__init__() + + if badvalue is None: + badvalue = nan + if name is None: + name = 'Distribution' + self.badvalue = badvalue + self.name = name + self.a = a + self.b = b + if a is None: + self.a = -inf + if b is None: + self.b = inf + self.xtol = xtol + self._size = 1 + self.m = 0.0 + self.moment_type = momtype + + self.expandarr = 1 + + self.shapes = shapes + self._construct_argparser(meths_to_inspect=[self._pdf, self._cdf], + locscale_in='loc=0, scale=1', + locscale_out='loc, scale') + + # nin correction + self._ppfvec = vectorize(self._ppf_single, otypes='d') + self._ppfvec.nin = self.numargs + 1 + self.vecentropy = vectorize(self._entropy, otypes='d') + self.vecentropy.nin = self.numargs + 1 + self._cdfvec = vectorize(self._cdf_single, otypes='d') + self._cdfvec.nin = self.numargs + 1 + + # backwards compatibility + self.vecfunc = self._ppfvec + self.veccdf = self._cdfvec + + self.extradoc = extradoc + if momtype == 0: + self.generic_moment = vectorize(self._mom0_sc, otypes='d') + else: + self.generic_moment = vectorize(self._mom1_sc, otypes='d') + # Because of the *args argument of _mom0_sc, vectorize cannot count the + # number of arguments correctly. + self.generic_moment.nin = self.numargs + 1 + + if longname is None: + if name[0] in ['aeiouAEIOU']: + hstr = "An " + else: + hstr = "A " + longname = hstr + name + + if sys.flags.optimize < 2: + # Skip adding docstrings if interpreter is run with -OO + if self.__doc__ is None: + self._construct_default_doc(longname=longname, + extradoc=extradoc) + else: + self._construct_doc() + + def _construct_default_doc(self, longname=None, extradoc=None): + """Construct instance docstring from the default template.""" + if longname is None: + longname = 'A' + if extradoc is None: + extradoc = '' + if extradoc.startswith('\n\n'): + extradoc = extradoc[2:] + self.__doc__ = ''.join(['%s continuous random variable.' 
% longname, + '\n\n%(before_notes)s\n', docheaders['notes'], + extradoc, '\n%(example)s']) + self._construct_doc() + + def _construct_doc(self): + """Construct the instance docstring with string substitutions.""" + tempdict = docdict.copy() + tempdict['name'] = self.name or 'distname' + tempdict['shapes'] = self.shapes or '' + + if self.shapes is None: + # remove shapes from call parameters if there are none + for item in ['callparams', 'default', 'before_notes']: + tempdict[item] = tempdict[item].replace( + "\n%(shapes)s : array_like\n shape parameters", "") + for i in range(2): + if self.shapes is None: + # necessary because we use %(shapes)s in two forms (w w/o ", ") + self.__doc__ = self.__doc__.replace("%(shapes)s, ", "") + self.__doc__ = doccer.docformat(self.__doc__, tempdict) + + def _ppf_to_solve(self, x, q, *args): + return self.cdf(*(x, )+args)-q + + def _ppf_single(self, q, *args): + left = right = None + if self.a > -np.inf: + left = self.a + if self.b < np.inf: + right = self.b + + factor = 10. + if not left: # i.e. self.a = -inf + left = -1.*factor + while self._ppf_to_solve(left, q, *args) > 0.: + right = left + left *= factor + # left is now such that cdf(left) < q + if not right: # i.e. self.b = inf + right = factor + while self._ppf_to_solve(right, q, *args) < 0.: + left = right + right *= factor + # right is now such that cdf(right) > q + + return optimize.brentq(self._ppf_to_solve, + left, right, args=(q,)+args, xtol=self.xtol) + + # moment from definition + def _mom_integ0(self, x, m, *args): + return x**m * self.pdf(x, *args) + + def _mom0_sc(self, m, *args): + return integrate.quad(self._mom_integ0, self.a, self.b, + args=(m,)+args)[0] + + # moment calculated using ppf + def _mom_integ1(self, q, m, *args): + return (self.ppf(q, *args))**m + + def _mom1_sc(self, m, *args): + return integrate.quad(self._mom_integ1, 0, 1, args=(m,)+args)[0] + + def _pdf(self, x, *args): + return derivative(self._cdf, x, dx=1e-5, args=args, order=5) + + ## Could also define any of these + def _logpdf(self, x, *args): + return log(self._pdf(x, *args)) + + def _cdf_single(self, x, *args): + return integrate.quad(self._pdf, self.a, x, args=args)[0] + + def _cdf(self, x, *args): + return self._cdfvec(x, *args) + + ## generic _argcheck, _logcdf, _sf, _logsf, _ppf, _isf, _rvs are defined + ## in rv_generic + + def pdf(self, x, *args, **kwds): + """ + Probability density function at x of the given RV. + + Parameters + ---------- + x : array_like + quantiles + arg1, arg2, arg3,... 
: array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + location parameter (default=0) + scale : array_like, optional + scale parameter (default=1) + + Returns + ------- + pdf : ndarray + Probability density function evaluated at x + + """ + args, loc, scale = self._parse_args(*args, **kwds) + x, loc, scale = map(asarray, (x, loc, scale)) + args = tuple(map(asarray, args)) + x = asarray((x-loc)*1.0/scale) + cond0 = self._argcheck(*args) & (scale > 0) + cond1 = (scale > 0) & (x >= self.a) & (x <= self.b) + cond = cond0 & cond1 + output = zeros(shape(cond), 'd') + putmask(output, (1-cond0)+np.isnan(x), self.badvalue) + if any(cond): + goodargs = argsreduce(cond, *((x,)+args+(scale,))) + scale, goodargs = goodargs[-1], goodargs[:-1] + place(output, cond, self._pdf(*goodargs) / scale) + if output.ndim == 0: + return output[()] + return output + + def logpdf(self, x, *args, **kwds): + """ + Log of the probability density function at x of the given RV. + + This uses a more numerically accurate calculation if available. + + Parameters + ---------- + x : array_like + quantiles + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + location parameter (default=0) + scale : array_like, optional + scale parameter (default=1) + + Returns + ------- + logpdf : array_like + Log of the probability density function evaluated at x + + """ + args, loc, scale = self._parse_args(*args, **kwds) + x, loc, scale = map(asarray, (x, loc, scale)) + args = tuple(map(asarray, args)) + x = asarray((x-loc)*1.0/scale) + cond0 = self._argcheck(*args) & (scale > 0) + cond1 = (scale > 0) & (x >= self.a) & (x <= self.b) + cond = cond0 & cond1 + output = empty(shape(cond), 'd') + output.fill(NINF) + putmask(output, (1-cond0)+np.isnan(x), self.badvalue) + if any(cond): + goodargs = argsreduce(cond, *((x,)+args+(scale,))) + scale, goodargs = goodargs[-1], goodargs[:-1] + place(output, cond, self._logpdf(*goodargs) - log(scale)) + if output.ndim == 0: + return output[()] + return output + + def cdf(self, x, *args, **kwds): + """ + Cumulative distribution function of the given RV. + + Parameters + ---------- + x : array_like + quantiles + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + location parameter (default=0) + scale : array_like, optional + scale parameter (default=1) + + Returns + ------- + cdf : ndarray + Cumulative distribution function evaluated at `x` + + """ + args, loc, scale = self._parse_args(*args, **kwds) + x, loc, scale = map(asarray, (x, loc, scale)) + args = tuple(map(asarray, args)) + x = (x-loc)*1.0/scale + cond0 = self._argcheck(*args) & (scale > 0) + cond1 = (scale > 0) & (x > self.a) & (x < self.b) + cond2 = (x >= self.b) & cond0 + cond = cond0 & cond1 + output = zeros(shape(cond), 'd') + place(output, (1-cond0)+np.isnan(x), self.badvalue) + place(output, cond2, 1.0) + if any(cond): # call only if at least 1 entry + goodargs = argsreduce(cond, *((x,)+args)) + place(output, cond, self._cdf(*goodargs)) + if output.ndim == 0: + return output[()] + return output + + def logcdf(self, x, *args, **kwds): + """ + Log of the cumulative distribution function at x of the given RV. + + Parameters + ---------- + x : array_like + quantiles + arg1, arg2, arg3,... 
: array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + location parameter (default=0) + scale : array_like, optional + scale parameter (default=1) + + Returns + ------- + logcdf : array_like + Log of the cumulative distribution function evaluated at x + + """ + args, loc, scale = self._parse_args(*args, **kwds) + x, loc, scale = map(asarray, (x, loc, scale)) + args = tuple(map(asarray, args)) + x = (x-loc)*1.0/scale + cond0 = self._argcheck(*args) & (scale > 0) + cond1 = (scale > 0) & (x > self.a) & (x < self.b) + cond2 = (x >= self.b) & cond0 + cond = cond0 & cond1 + output = empty(shape(cond), 'd') + output.fill(NINF) + place(output, (1-cond0)*(cond1 == cond1)+np.isnan(x), self.badvalue) + place(output, cond2, 0.0) + if any(cond): # call only if at least 1 entry + goodargs = argsreduce(cond, *((x,)+args)) + place(output, cond, self._logcdf(*goodargs)) + if output.ndim == 0: + return output[()] + return output + + def sf(self, x, *args, **kwds): + """ + Survival function (1-cdf) at x of the given RV. + + Parameters + ---------- + x : array_like + quantiles + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + location parameter (default=0) + scale : array_like, optional + scale parameter (default=1) + + Returns + ------- + sf : array_like + Survival function evaluated at x + + """ + args, loc, scale = self._parse_args(*args, **kwds) + x, loc, scale = map(asarray, (x, loc, scale)) + args = tuple(map(asarray, args)) + x = (x-loc)*1.0/scale + cond0 = self._argcheck(*args) & (scale > 0) + cond1 = (scale > 0) & (x > self.a) & (x < self.b) + cond2 = cond0 & (x <= self.a) + cond = cond0 & cond1 + output = zeros(shape(cond), 'd') + place(output, (1-cond0)+np.isnan(x), self.badvalue) + place(output, cond2, 1.0) + if any(cond): + goodargs = argsreduce(cond, *((x,)+args)) + place(output, cond, self._sf(*goodargs)) + if output.ndim == 0: + return output[()] + return output + + def logsf(self, x, *args, **kwds): + """ + Log of the survival function of the given RV. + + Returns the log of the "survival function," defined as (1 - `cdf`), + evaluated at `x`. + + Parameters + ---------- + x : array_like + quantiles + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + location parameter (default=0) + scale : array_like, optional + scale parameter (default=1) + + Returns + ------- + logsf : ndarray + Log of the survival function evaluated at `x`. + + """ + args, loc, scale = self._parse_args(*args, **kwds) + x, loc, scale = map(asarray, (x, loc, scale)) + args = tuple(map(asarray, args)) + x = (x-loc)*1.0/scale + cond0 = self._argcheck(*args) & (scale > 0) + cond1 = (scale > 0) & (x > self.a) & (x < self.b) + cond2 = cond0 & (x <= self.a) + cond = cond0 & cond1 + output = empty(shape(cond), 'd') + output.fill(NINF) + place(output, (1-cond0)+np.isnan(x), self.badvalue) + place(output, cond2, 0.0) + if any(cond): + goodargs = argsreduce(cond, *((x,)+args)) + place(output, cond, self._logsf(*goodargs)) + if output.ndim == 0: + return output[()] + return output + + def ppf(self, q, *args, **kwds): + """ + Percent point function (inverse of cdf) at q of the given RV. + + Parameters + ---------- + q : array_like + lower tail probability + arg1, arg2, arg3,... 
: array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + location parameter (default=0) + scale : array_like, optional + scale parameter (default=1) + + Returns + ------- + x : array_like + quantile corresponding to the lower tail probability q. + + """ + args, loc, scale = self._parse_args(*args, **kwds) + q, loc, scale = map(asarray, (q, loc, scale)) + args = tuple(map(asarray, args)) + cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc) + cond1 = (0 < q) & (q < 1) + cond2 = cond0 & (q == 0) + cond3 = cond0 & (q == 1) + cond = cond0 & cond1 + output = valarray(shape(cond), value=self.badvalue) + + lower_bound = self.a * scale + loc + upper_bound = self.b * scale + loc + place(output, cond2, argsreduce(cond2, lower_bound)[0]) + place(output, cond3, argsreduce(cond3, upper_bound)[0]) + + if any(cond): # call only if at least 1 entry + goodargs = argsreduce(cond, *((q,)+args+(scale, loc))) + scale, loc, goodargs = goodargs[-2], goodargs[-1], goodargs[:-2] + place(output, cond, self._ppf(*goodargs) * scale + loc) + if output.ndim == 0: + return output[()] + return output + + def isf(self, q, *args, **kwds): + """ + Inverse survival function at q of the given RV. + + Parameters + ---------- + q : array_like + upper tail probability + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + location parameter (default=0) + scale : array_like, optional + scale parameter (default=1) + + Returns + ------- + x : ndarray or scalar + Quantile corresponding to the upper tail probability q. + + """ + args, loc, scale = self._parse_args(*args, **kwds) + q, loc, scale = map(asarray, (q, loc, scale)) + args = tuple(map(asarray, args)) + cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc) + cond1 = (0 < q) & (q < 1) + cond2 = cond0 & (q == 1) + cond3 = cond0 & (q == 0) + cond = cond0 & cond1 + output = valarray(shape(cond), value=self.badvalue) + + lower_bound = self.a * scale + loc + upper_bound = self.b * scale + loc + place(output, cond2, argsreduce(cond2, lower_bound)[0]) + place(output, cond3, argsreduce(cond3, upper_bound)[0]) + + if any(cond): + goodargs = argsreduce(cond, *((q,)+args+(scale, loc))) + scale, loc, goodargs = goodargs[-2], goodargs[-1], goodargs[:-2] + place(output, cond, self._isf(*goodargs) * scale + loc) + if output.ndim == 0: + return output[()] + return output + + def link(self, x, logSF, theta, i): + ''' Return dist. par. no. i as function of quantile (x) and log survival probability (sf) + where + theta is the list containing all parameters including location and scale. + ''' + raise ValueError('Link function not implemented for the %s distribution' % self.name) + return None + + def _nnlf(self, x, *args): + return -sum(self._logpdf(x, *args), axis=0) + + def nnlf(self, theta, x): + '''Return negative loglikelihood function + + Notes + ----- + This is ``-sum(log pdf(x, theta), axis=0)`` where theta are the + parameters (including loc and scale). 
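+
+        As an illustrative check (using the standard normal from
+        `scipy.stats`, so that ``theta = (loc, scale)`` with no shape
+        parameters and the ``N*log(scale)`` term vanishes for ``scale=1``):
+
+        >>> from scipy.stats import norm
+        >>> x = np.array([-1.0, 0.0, 1.0])
+        >>> np.allclose(norm.nnlf((0.0, 1.0), x),
+        ...             -np.sum(norm.logpdf(x, 0.0, 1.0)))
+        True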
+ ''' + try: + loc = theta[-2] + scale = theta[-1] + args = tuple(theta[:-2]) + except IndexError: + raise ValueError("Not enough input arguments.") + if not self._argcheck(*args) or scale <= 0: + return inf + x = asarray((x-loc) / scale) + cond0 = (x <= self.a) | (self.b <= x) + if (any(cond0)): + return inf + else: + N = len(x) + return self._nnlf(x, *args) + N * log(scale) + + def _penalized_nnlf(self, theta, x): + ''' Return negative loglikelihood function, + i.e., - sum (log pdf(x, theta), axis=0) + where theta are the parameters (including loc and scale) + ''' + try: + loc = theta[-2] + scale = theta[-1] + args = tuple(theta[:-2]) + except IndexError: + raise ValueError("Not enough input arguments.") + if not self._argcheck(*args) or scale <= 0: + return inf + x = asarray((x-loc) / scale) + + loginf = log(_XMAX) + + if np.isneginf(self.a).all() and np.isinf(self.b).all(): + Nbad = 0 + else: + cond0 = (x <= self.a) | (self.b <= x) + Nbad = sum(cond0) + if Nbad > 0: + x = argsreduce(~cond0, x)[0] + + N = len(x) + return self._nnlf(x, *args) + N*log(scale) + Nbad * 100.0 * loginf + + def hessian_nnlf(self, theta, data, eps=None): + ''' approximate hessian of nnlf where theta are the parameters (including loc and scale) + ''' + #Nd = len(x) + np = len(theta) + # pab 07.01.2001: Always choose the stepsize h so that + # it is an exactly representable number. + # This is important when calculating numerical derivatives and is + # accomplished by the following. + + if eps == None: + eps = (_EPS) ** 0.4 + #xmin = floatinfo.machar.xmin + #myfun = lambda y: max(y,100.0*log(xmin)) #% trick to avoid log of zero + delta = (eps + 2.0) - 2.0 + delta2 = delta ** 2.0 + # Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with + # 1/(d^2 L(theta|x)/dtheta^2) + # using central differences + + LL = self.nnlf(theta, data) + H = zeros((np, np)) #%% Hessian matrix + theta = tuple(theta) + for ix in xrange(np): + sparam = list(theta) + sparam[ix] = theta[ix] + delta + fp = self.nnlf(sparam, data) + #fp = sum(myfun(x)) + + sparam[ix] = theta[ix] - delta + fm = self.nnlf(sparam, data) + #fm = sum(myfun(x)) + + H[ix, ix] = (fp - 2 * LL + fm) / delta2 + for iy in range(ix + 1, np): + sparam[ix] = theta[ix] + delta + sparam[iy] = theta[iy] + delta + fpp = self.nnlf(sparam, data) + #fpp = sum(myfun(x)) + + sparam[iy] = theta[iy] - delta + fpm = self.nnlf(sparam, data) + #fpm = sum(myfun(x)) + + sparam[ix] = theta[ix] - delta + fmm = self.nnlf(sparam, data) + #fmm = sum(myfun(x)); + + sparam[iy] = theta[iy] + delta + fmp = self.nnlf(sparam, data) + #fmp = sum(myfun(x)) + H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2) + H[iy, ix] = H[ix, iy] + sparam[iy] = theta[iy] + + # invert the Hessian matrix (i.e. invert the observed information number) + #pcov = -pinv(H); + return - H + + def nlogps(self, theta, x): + """ Moran's negative log Product Spacings statistic + + where theta are the parameters (including loc and scale) + + Note the data in x must be sorted + + References + ----------- + + R. C. H. Cheng; N. A. K. Amin (1983) + "Estimating Parameters in Continuous Univariate Distributions with a + Shifted Origin.", + Journal of the Royal Statistical Society. Series B (Methodological), + Vol. 45, No. 3. (1983), pp. 394-403. + + R. C. H. Cheng; M. A. Stephens (1989) + "A Goodness-Of-Fit Test Using Moran's Statistic with Estimated + Parameters", Biometrika, 76, 2, pp 385-392 + + Wong, T.S.T. and Li, W.K. 
(2006) + "A note on the estimation of extreme value distributions using maximum + product of spacings.", + IMS Lecture Notes Monograph Series 2006, Vol. 52, pp. 272-283 + """ + + try: + loc = theta[-2] + scale = theta[-1] + args = tuple(theta[:-2]) + except IndexError: + raise ValueError("Not enough input arguments.") + if not self._argcheck(*args) or scale <= 0: + return inf + x = asarray((x - loc) / scale) + cond0 = (x <= self.a) | (self.b <= x) + Nbad = sum(cond0) + if Nbad > 0: + x = argsreduce(~cond0, x)[0] + + lowertail = True + if lowertail: + prb = np.hstack((0.0, self.cdf(x, *args), 1.0)) + dprb = np.diff(prb) + else: + prb = np.hstack((1.0, self.sf(x, *args), 0.0)) + dprb = -np.diff(prb) + + logD = log(dprb) + dx = np.diff(x, axis=0) + tie = (dx == 0) + if any(tie): + # TODO : implement this method for treating ties in data: + # Assume measuring error is delta. Then compute + # yL = F(xi-delta,theta) + # yU = F(xi+delta,theta) + # and replace + # logDj = log((yU-yL)/(r-1)) for j = i+1,i+2,...i+r-1 + + # The following is OK when only minimization of T is wanted + i_tie = np.nonzero(tie) + tiedata = x[i_tie] + logD[i_tie + 1] = log(self._pdf(tiedata, *args)) - log(scale) + + finiteD = np.isfinite(logD) + nonfiniteD = 1 - finiteD + Nbad += sum(nonfiniteD, axis=0) + if Nbad > 0: + T = -sum(logD[finiteD], axis=0) + 100.0 * log(_XMAX) * Nbad + else: + T = -sum(logD, axis=0) #Moran's negative log product spacing statistic + return T + + def hessian_nlogps(self, theta, data, eps=None): + ''' approximate hessian of nlogps where theta are the parameters (including loc and scale) + ''' + np = len(theta) + # pab 07.01.2001: Always choose the stepsize h so that + # it is an exactly representable number. + # This is important when calculating numerical derivatives and is + # accomplished by the following. + + if eps == None: + eps = (_EPS) ** 0.4 + #xmin = floatinfo.machar.xmin + #myfun = lambda y: max(y,100.0*log(xmin)) #% trick to avoid log of zero + delta = (eps + 2.0) - 2.0 + delta2 = delta ** 2.0 + # Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with + # 1/(d^2 L(theta|x)/dtheta^2) + # using central differences + + LL = self.nlogps(theta, data) + H = zeros((np, np)) # Hessian matrix + theta = tuple(theta) + for ix in xrange(np): + sparam = list(theta) + sparam[ix] = theta[ix] + delta + fp = self.nlogps(sparam, data) + #fp = sum(myfun(x)) + + sparam[ix] = theta[ix] - delta + fm = self.nlogps(sparam, data) + #fm = sum(myfun(x)) + + H[ix, ix] = (fp - 2 * LL + fm) / delta2 + for iy in range(ix + 1, np): + sparam[ix] = theta[ix] + delta + sparam[iy] = theta[iy] + delta + fpp = self.nlogps(sparam, data) + #fpp = sum(myfun(x)) + + sparam[iy] = theta[iy] - delta + fpm = self.nlogps(sparam, data) + #fpm = sum(myfun(x)) + + sparam[ix] = theta[ix] - delta + fmm = self.nlogps(sparam, data) + #fmm = sum(myfun(x)); + + sparam[iy] = theta[iy] + delta + fmp = self.nlogps(sparam, data) + #fmp = sum(myfun(x)) + H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2) + H[iy, ix] = H[ix, iy] + sparam[iy] = theta[iy]; + + # invert the Hessian matrix (i.e. 
invert the observed information number) + #pcov = -pinv(H); + return - H + + # return starting point for fit (shape arguments + loc + scale) + def _fitstart(self, data, args=None): + if args is None: + args = (1.0,)*self.numargs + return args + self.fit_loc_scale(data, *args) + + # Return the (possibly reduced) function to optimize in order to find MLE + # estimates for the .fit method + def _reduce_func(self, args, kwds): + args = list(args) + Nargs = len(args) + fixedn = [] + index = list(range(Nargs)) + names = ['f%d' % n for n in range(Nargs - 2)] + ['floc', 'fscale'] + x0 = [] + for n, key in zip(index, names): + if key in kwds: + fixedn.append(n) + args[n] = kwds[key] + else: + x0.append(args[n]) + method = kwds.get('method', 'ml').lower() + if method.startswith('mps'): + fitfun = self.nlogps + else: + fitfun = self._penalized_nnlf + + if len(fixedn) == 0: + func = fitfun + restore = None + else: + if len(fixedn) == len(index): + raise ValueError("All parameters fixed. There is nothing to optimize.") + + def restore(args, theta): + # Replace with theta for all numbers not in fixedn + # This allows the non-fixed values to vary, but + # we still call self.nnlf with all parameters. + i = 0 + for n in range(Nargs): + if n not in fixedn: + args[n] = theta[i] + i += 1 + return args + + def func(theta, x): + newtheta = restore(args[:], theta) + return fitfun(newtheta, x) + + return x0, func, restore, args + + def fit(self, data, *args, **kwds): + """ + Return MLEs for shape, location, and scale parameters from data. + + MLE stands for Maximum Likelihood Estimate. Starting estimates for + the fit are given by input arguments; for any arguments not provided + with starting estimates, ``self._fitstart(data)`` is called to generate + such. + + One can hold some parameters fixed to specific values by passing in + keyword arguments ``f0``, ``f1``, ..., ``fn`` (for shape parameters) + and ``floc`` and ``fscale`` (for location and scale parameters, + respectively). + + Parameters + ---------- + data : array_like + Data to use in calculating the MLEs. + args : floats, optional + Starting value(s) for any shape-characterizing arguments (those not + provided will be determined by a call to ``_fitstart(data)``). + No default value. + kwds : floats, optional + Starting values for the location and scale parameters; no default. + Special keyword arguments are recognized as holding certain + parameters fixed: + + f0...fn : hold respective shape parameters fixed. + + floc : hold location parameter fixed to specified value. + + fscale : hold scale parameter fixed to specified value. + + optimizer : The optimizer to use. The optimizer must take func, + and starting position as the first two arguments, + plus args (for extra arguments to pass to the + function to be optimized) and disp=0 to suppress + output as keyword arguments. + + Returns + ------- + shape, loc, scale : tuple of floats + MLEs for any shape statistics, followed by those for location and + scale. + + Notes + ----- + This fit is computed by maximizing a log-likelihood function, with + penalty applied for samples outside of range of the distribution. The + returned answer is not guaranteed to be the globally optimal MLE, it + may only be locally optimal, or the optimization may fail altogether. 
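+
+        As an illustrative sketch (assuming ``data`` is a 1-D array of
+        observations and using the normal distribution, which has no shape
+        parameters):
+
+        >>> from scipy.stats import norm
+        >>> data = norm.rvs(loc=3.0, scale=2.0, size=1000)
+        >>> loc_hat, scale_hat = norm.fit(data)        # estimate loc and scale
+        >>> loc0, scale0_hat = norm.fit(data, floc=0)  # hold loc fixed at 0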
+ """ + Narg = len(args) + if Narg > self.numargs: + raise TypeError("Too many input arguments.") + + start = [None]*2 + if (Narg < self.numargs) or not ('loc' in kwds and + 'scale' in kwds): + # get distribution specific starting locations + start = self._fitstart(data) + args += start[Narg:-2] + loc = kwds.get('loc', start[-2]) + scale = kwds.get('scale', start[-1]) + args += (loc, scale) + x0, func, restore, args = self._reduce_func(args, kwds) + + optimizer = kwds.get('optimizer', optimize.fmin) + # convert string to function in scipy.optimize + if not callable(optimizer) and isinstance(optimizer, string_types): + if not optimizer.startswith('fmin_'): + optimizer = "fmin_"+optimizer + if optimizer == 'fmin_': + optimizer = 'fmin' + try: + optimizer = getattr(optimize, optimizer) + except AttributeError: + raise ValueError("%s is not a valid optimizer" % optimizer) + vals = optimizer(func, x0, args=(ravel(data),), disp=0) + if restore is not None: + vals = restore(args, vals) + vals = tuple(vals) + return vals + + def fit2(self, data, *args, **kwds): + ''' Return Maximum Likelihood or Maximum Product Spacing estimator object + + Parameters + ---------- + data : array-like + Data to use in calculating the ML or MPS estimators + args : optional + Starting values for any shape arguments (those not specified + will be determined by dist._fitstart(data)) + kwds : loc, scale + Starting values for the location and scale parameters + Special keyword arguments are recognized as holding certain + parameters fixed: + f0..fn : hold respective shape paramters fixed + floc : hold location parameter fixed to specified value + fscale : hold scale parameter fixed to specified value + method : of estimation. Options are + 'ml' : Maximum Likelihood method (default) + 'mps': Maximum Product Spacing method + alpha : scalar, optional + Confidence coefficent (default=0.05) + search : bool + If true search for best estimator (default), + otherwise return object with initial distribution parameters + copydata : bool + If true copydata (default) + optimizer : The optimizer to use. The optimizer must take func, + and starting position as the first two arguments, + plus args (for extra arguments to pass to the + function to be optimized) and disp=0 to suppress + output as keyword arguments. + + Return + ------ + phat : FitDistribution object + Fitted distribution object with following member variables: + LLmax : loglikelihood function evaluated using par + LPSmax : log product spacing function evaluated using par + pvalue : p-value for the fit + par : distribution parameters (fixed and fitted) + par_cov : covariance of distribution parameters + par_fix : fixed distribution parameters + par_lower : lower (1-alpha)% confidence bound for the parameters + par_upper : upper (1-alpha)% confidence bound for the parameters + + Note + ---- + `data` is sorted using this function, so if `copydata`==False the data + in your namespace will be sorted as well. + ''' + return FitDistribution(self, data, *args, **kwds) + + def fit_loc_scale(self, data, *args): + """ + Estimate loc and scale parameters from data using 1st and 2nd moments. + + Parameters + ---------- + data : array_like + Data to fit. + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + + Returns + ------- + Lhat : float + Estimated location parameter for the data. + Shat : float + Estimated scale parameter for the data. 
+ + """ + mu, mu2 = self.stats(*args, **{'moments': 'mv'}) + tmp = asarray(data) + muhat = tmp.mean() + mu2hat = tmp.var() + Shat = sqrt(mu2hat / mu2) + Lhat = muhat - Shat*mu + if not np.isfinite(Lhat): + Lhat = 0 + if not (np.isfinite(Shat) and (0 < Shat)): + Shat = 1 + return Lhat, Shat + + @np.deprecate + def est_loc_scale(self, data, *args): + """This function is deprecated, use self.fit_loc_scale(data) instead. + """ + return self.fit_loc_scale(data, *args) + + def _entropy(self, *args): + def integ(x): + val = self._pdf(x, *args) + return xlogy(val, val) + + # upper limit is often inf, so suppress warnings when integrating + olderr = np.seterr(over='ignore') + entr = -integrate.quad(integ, self.a, self.b)[0] + np.seterr(**olderr) + + if not np.isnan(entr): + return entr + else: + # try with different limits if integration problems + low, upp = self.ppf([1e-10, 1. - 1e-10], *args) + if np.isinf(self.b): + upper = upp + else: + upper = self.b + if np.isinf(self.a): + lower = low + else: + lower = self.a + return -integrate.quad(integ, lower, upper)[0] + + def entropy(self, *args, **kwds): + """ + Differential entropy of the RV. + + Parameters + ---------- + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + loc : array_like, optional + Location parameter (default=0). + scale : array_like, optional + Scale parameter (default=1). + + """ + args, loc, scale = self._parse_args(*args, **kwds) + args = tuple(map(asarray, args)) + cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc) + output = zeros(shape(cond0), 'd') + place(output, (1-cond0), self.badvalue) + goodargs = argsreduce(cond0, *args) + # np.vectorize doesn't work when numargs == 0 in numpy 1.5.1 + if self.numargs == 0: + place(output, cond0, self._entropy() + log(scale)) + else: + place(output, cond0, self.vecentropy(*goodargs) + log(scale)) + + return output + + def expect(self, func=None, args=(), loc=0, scale=1, lb=None, ub=None, + conditional=False, **kwds): + """Calculate expected value of a function with respect to the + distribution. + + The expected value of a function ``f(x)`` with respect to a + distribution ``dist`` is defined as:: + + ubound + E[x] = Integral(f(x) * dist.pdf(x)) + lbound + + Parameters + ---------- + func : callable, optional + Function for which integral is calculated. Takes only one argument. + The default is the identity mapping f(x) = x. + args : tuple, optional + Argument (parameters) of the distribution. + lb, ub : scalar, optional + Lower and upper bound for integration. default is set to the + support of the distribution. + conditional : bool, optional + If True, the integral is corrected by the conditional probability + of the integration interval. The return value is the expectation + of the function, conditional on being in the given interval. + Default is False. + + Additional keyword arguments are passed to the integration routine. + + Returns + ------- + expect : float + The calculated expected value. + + Notes + ----- + The integration behavior of this function is inherited from + `integrate.quad`. 
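+
+        As an illustrative check, the second moment of the standard normal
+        is one:
+
+        >>> from scipy.stats import norm
+        >>> np.allclose(norm.expect(lambda x: x**2), 1.0)
+        True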
+ + """ + lockwds = {'loc': loc, + 'scale': scale} + self._argcheck(*args) + if func is None: + def fun(x, *args): + return x * self.pdf(x, *args, **lockwds) + else: + def fun(x, *args): + return func(x) * self.pdf(x, *args, **lockwds) + if lb is None: + lb = loc + self.a * scale + if ub is None: + ub = loc + self.b * scale + if conditional: + invfac = (self.sf(lb, *args, **lockwds) + - self.sf(ub, *args, **lockwds)) + else: + invfac = 1.0 + kwds['args'] = args + # Silence floating point warnings from integration. + olderr = np.seterr(all='ignore') + vals = integrate.quad(fun, lb, ub, **kwds)[0] / invfac + np.seterr(**olderr) + return vals + + +## Handlers for generic case where xk and pk are given +## The _drv prefix probably means discrete random variable. + +def _drv_pmf(self, xk, *args): + try: + return self.P[xk] + except KeyError: + return 0.0 + + +def _drv_cdf(self, xk, *args): + indx = argmax((self.xk > xk), axis=-1)-1 + return self.F[self.xk[indx]] + + +def _drv_ppf(self, q, *args): + indx = argmax((self.qvals >= q), axis=-1) + return self.Finv[self.qvals[indx]] + + +def _drv_nonzero(self, k, *args): + return 1 + + +def _drv_moment(self, n, *args): + n = asarray(n) + return sum(self.xk**n[newaxis, ...] * self.pk, axis=0) + + +def _drv_moment_gen(self, t, *args): + t = asarray(t) + return sum(exp(self.xk * t[newaxis, ...]) * self.pk, axis=0) + + +def _drv2_moment(self, n, *args): + """Non-central moment of discrete distribution.""" + # many changes, originally not even a return + tot = 0.0 + diff = 1e100 + # pos = self.a + pos = max(0.0, 1.0*self.a) + count = 0 + # handle cases with infinite support + ulimit = max(1000, (min(self.b, 1000) + max(self.a, -1000))/2.0) + llimit = min(-1000, (min(self.b, 1000) + max(self.a, -1000))/2.0) + + while (pos <= self.b) and ((pos <= ulimit) or + (diff > self.moment_tol)): + diff = np.power(pos, n) * self.pmf(pos, *args) + # use pmf because _pmf does not check support in randint and there + # might be problems ? 
with correct self.a, self.b at this stage + tot += diff + pos += self.inc + count += 1 + + if self.a < 0: # handle case when self.a = -inf + diff = 1e100 + pos = -self.inc + while (pos >= self.a) and ((pos >= llimit) or + (diff > self.moment_tol)): + diff = np.power(pos, n) * self.pmf(pos, *args) + # using pmf instead of _pmf, see above + tot += diff + pos -= self.inc + count += 1 + return tot + + +def _drv2_ppfsingle(self, q, *args): # Use basic bisection algorithm + b = self.b + a = self.a + if isinf(b): # Be sure ending point is > q + b = int(max(100*q, 10)) + while 1: + if b >= self.b: + qb = 1.0 + break + qb = self._cdf(b, *args) + if (qb < q): + b += 10 + else: + break + else: + qb = 1.0 + if isinf(a): # be sure starting point < q + a = int(min(-100*q, -10)) + while 1: + if a <= self.a: + qb = 0.0 + break + qa = self._cdf(a, *args) + if (qa > q): + a -= 10 + else: + break + else: + qa = self._cdf(a, *args) + + while 1: + if (qa == q): + return a + if (qb == q): + return b + if b <= a+1: + # testcase: return wrong number at lower index + # python -c "from scipy.stats import zipf;print zipf.ppf(0.01, 2)" wrong + # python -c "from scipy.stats import zipf;print zipf.ppf([0.01, 0.61, 0.77, 0.83], 2)" + # python -c "from scipy.stats import logser;print logser.ppf([0.1, 0.66, 0.86, 0.93], 0.6)" + if qa > q: + return a + else: + return b + c = int((a+b)/2.0) + qc = self._cdf(c, *args) + if (qc < q): + if a != c: + a = c + else: + raise RuntimeError('updating stopped, endless loop') + qa = qc + elif (qc > q): + if b != c: + b = c + else: + raise RuntimeError('updating stopped, endless loop') + qb = qc + else: + return c + + +def entropy(pk, qk=None, base=None): + """Calculate the entropy of a distribution for given probability values. + + If only probabilities `pk` are given, the entropy is calculated as + ``S = -sum(pk * log(pk), axis=0)``. + + If `qk` is not None, then compute a relative entropy (also known as + Kullback-Leibler divergence or Kullback-Leibler distance) + ``S = sum(pk * log(pk / qk), axis=0)``. + + This routine will normalize `pk` and `qk` if they don't sum to 1. + + Parameters + ---------- + pk : sequence + Defines the (discrete) distribution. ``pk[i]`` is the (possibly + unnormalized) probability of event ``i``. + qk : sequence, optional + Sequence against which the relative entropy is computed. Should be in + the same format as `pk`. + base : float, optional + The logarithmic base to use, defaults to ``e`` (natural logarithm). + + Returns + ------- + S : float + The calculated entropy. + + """ + pk = asarray(pk) + pk = 1.0*pk / sum(pk, axis=0) + if qk is None: + vec = xlogy(pk, pk) + else: + qk = asarray(qk) + if len(qk) != len(pk): + raise ValueError("qk and pk must have same length.") + qk = 1.0*qk / sum(qk, axis=0) + # If qk is zero anywhere, then unless pk is zero at those places + # too, the relative entropy is infinite. + mask = qk == 0.0 + qk[mask] = 1.0 # Avoid the divide-by-zero warning + quotient = pk / qk + vec = -xlogy(pk, quotient) + vec[mask & (pk != 0.0)] = -inf + vec[mask & (pk == 0.0)] = 0.0 + S = -sum(vec, axis=0) + if base is not None: + S /= log(base) + return S + + +# Must over-ride one of _pmf or _cdf or pass in +# x_k, p(x_k) lists in initialization + +class rv_discrete(rv_generic): + """ + A generic discrete random variable class meant for subclassing. + + `rv_discrete` is a base class to construct specific distribution classes + and instances from for discrete random variables. 
rv_discrete can be used + to construct an arbitrary distribution with defined by a list of support + points and the corresponding probabilities. + + Parameters + ---------- + a : float, optional + Lower bound of the support of the distribution, default: 0 + b : float, optional + Upper bound of the support of the distribution, default: plus infinity + moment_tol : float, optional + The tolerance for the generic calculation of moments + values : tuple of two array_like + (xk, pk) where xk are points (integers) with positive probability pk + with sum(pk) = 1 + inc : integer + increment for the support of the distribution, default: 1 + other values have not been tested + badvalue : object, optional + The value in (masked) arrays that indicates a value that should be + ignored. + name : str, optional + The name of the instance. This string is used to construct the default + example for distributions. + longname : str, optional + This string is used as part of the first line of the docstring returned + when a subclass has no docstring of its own. Note: `longname` exists + for backwards compatibility, do not use for new subclasses. + shapes : str, optional + The shape of the distribution. For example ``"m, n"`` for a + distribution that takes two integers as the first two arguments for all + its methods. + extradoc : str, optional + This string is used as the last part of the docstring returned when a + subclass has no docstring of its own. Note: `extradoc` exists for + backwards compatibility, do not use for new subclasses. + + Methods + ------- + generic.rvs(, loc=0, size=1) + random variates + + generic.pmf(x, , loc=0) + probability mass function + + logpmf(x, , loc=0) + log of the probability density function + + generic.cdf(x, , loc=0) + cumulative density function + + generic.logcdf(x, , loc=0) + log of the cumulative density function + + generic.sf(x, , loc=0) + survival function (1-cdf --- sometimes more accurate) + + generic.logsf(x, , loc=0, scale=1) + log of the survival function + + generic.ppf(q, , loc=0) + percent point function (inverse of cdf --- percentiles) + + generic.isf(q, , loc=0) + inverse survival function (inverse of sf) + + generic.moment(n, , loc=0) + non-central n-th moment of the distribution. May not work for array + arguments. + + generic.stats(, loc=0, moments='mv') + mean('m', axis=0), variance('v'), skew('s'), and/or kurtosis('k') + + generic.entropy(, loc=0) + entropy of the RV + + generic.expect(func=None, args=(), loc=0, lb=None, ub=None, + conditional=False) + Expected value of a function with respect to the distribution. + Additional kwd arguments passed to integrate.quad + + generic.median(, loc=0) + Median of the distribution. + + generic.mean(, loc=0) + Mean of the distribution. + + generic.std(, loc=0) + Standard deviation of the distribution. + + generic.var(, loc=0) + Variance of the distribution. + + generic.interval(alpha, , loc=0) + Interval that with `alpha` percent probability contains a random + realization of this distribution. + + generic(, loc=0) + calling a distribution instance returns a frozen distribution + + Notes + ----- + + You can construct an arbitrary discrete rv where ``P{X=xk} = pk`` + by passing to the rv_discrete initialization method (through the + values=keyword) a tuple of sequences (xk, pk) which describes only those + values of X (xk) that occur with nonzero probability (pk). 
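+
+    For instance (an illustrative sketch; the name ``die`` is made up), a fair
+    six-sided die can be constructed this way before turning to subclassing:
+
+    >>> from scipy.stats import rv_discrete
+    >>> xk, pk = [1, 2, 3, 4, 5, 6], [1.0/6] * 6
+    >>> die = rv_discrete(name='die', values=(xk, pk))
+    >>> np.allclose(die.pmf(3), 1.0/6)
+    True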
+ + To create a new discrete distribution, we would do the following:: + + class poisson_gen(rv_discrete): + #"Poisson distribution" + def _pmf(self, k, mu): + ... + + and create an instance:: + + poisson = poisson_gen(name="poisson", + longname='A Poisson') + + The docstring can be created from a template. + + Alternatively, the object may be called (as a function) to fix the shape + and location parameters returning a "frozen" discrete RV object:: + + myrv = generic(, loc=0) + - frozen RV object with the same methods but holding the given + shape and location fixed. + + A note on ``shapes``: subclasses need not specify them explicitly. In this + case, the `shapes` will be automatically deduced from the signatures of the + overridden methods. + If, for some reason, you prefer to avoid relying on introspection, you can + specify ``shapes`` explicitly as an argument to the instance constructor. + + + Examples + -------- + + Custom made discrete distribution: + + >>> import matplotlib.pyplot as plt + >>> from scipy import stats + >>> xk = np.arange(7) + >>> pk = (0.1, 0.2, 0.3, 0.1, 0.1, 0.1, 0.1) + >>> custm = stats.rv_discrete(name='custm', values=(xk, pk)) + >>> h = plt.plot(xk, custm.pmf(xk)) + + Random number generation: + + >>> R = custm.rvs(size=100) + + Display frozen pmf: + + >>> numargs = generic.numargs + >>> [ ] = ['Replace with resonable value', ]*numargs + >>> rv = generic() + >>> x = np.arange(0, np.min(rv.dist.b, 3)+1) + >>> h = plt.plot(x, rv.pmf(x)) + + Here, ``rv.dist.b`` is the right endpoint of the support of ``rv.dist``. + + Check accuracy of cdf and ppf: + + >>> prb = generic.cdf(x, ) + >>> h = plt.semilogy(np.abs(x-generic.ppf(prb, ))+1e-20) + + """ + + def __init__(self, a=0, b=inf, name=None, badvalue=None, + moment_tol=1e-8, values=None, inc=1, longname=None, + shapes=None, extradoc=None): + + super(rv_discrete, self).__init__() + + if badvalue is None: + badvalue = nan + if name is None: + name = 'Distribution' + self.badvalue = badvalue + self.a = a + self.b = b + self.name = name + self.moment_tol = moment_tol + self.inc = inc + self._cdfvec = vectorize(self._cdf_single, otypes='d') + self.return_integers = 1 + self.vecentropy = vectorize(self._entropy) + self.shapes = shapes + self.extradoc = extradoc + + if values is not None: + self.xk, self.pk = values + self.return_integers = 0 + indx = argsort(ravel(self.xk)) + self.xk = take(ravel(self.xk), indx, 0) + self.pk = take(ravel(self.pk), indx, 0) + self.a = self.xk[0] + self.b = self.xk[-1] + self.P = dict(zip(self.xk, self.pk)) + self.qvals = np.cumsum(self.pk, axis=0) + self.F = dict(zip(self.xk, self.qvals)) + decreasing_keys = sorted(self.F.keys(), reverse=True) + self.Finv = dict((self.F[k], k) for k in decreasing_keys) + self._ppf = instancemethod(vectorize(_drv_ppf, otypes='d'), + self, rv_discrete) + self._pmf = instancemethod(vectorize(_drv_pmf, otypes='d'), + self, rv_discrete) + self._cdf = instancemethod(vectorize(_drv_cdf, otypes='d'), + self, rv_discrete) + self._nonzero = instancemethod(_drv_nonzero, self, rv_discrete) + self.generic_moment = instancemethod(_drv_moment, + self, rv_discrete) + self.moment_gen = instancemethod(_drv_moment_gen, + self, rv_discrete) + self._construct_argparser(meths_to_inspect=[_drv_pmf], + locscale_in='loc=0', + # scale=1 for discrete RVs + locscale_out='loc, 1') + else: + self._construct_argparser(meths_to_inspect=[self._pmf, self._cdf], + locscale_in='loc=0', + # scale=1 for discrete RVs + locscale_out='loc, 1') + + # nin correction needs to be after we know 
numargs + # correct nin for generic moment vectorization + _vec_generic_moment = vectorize(_drv2_moment, otypes='d') + _vec_generic_moment.nin = self.numargs + 2 + self.generic_moment = instancemethod(_vec_generic_moment, + self, rv_discrete) + + # backwards compatibility + self.vec_generic_moment = _vec_generic_moment + + # correct nin for ppf vectorization + _vppf = vectorize(_drv2_ppfsingle, otypes='d') + _vppf.nin = self.numargs + 2 # +1 is for self + self._ppfvec = instancemethod(_vppf, + self, rv_discrete) + + # now that self.numargs is defined, we can adjust nin + self._cdfvec.nin = self.numargs + 1 + + # generate docstring for subclass instances + if longname is None: + if name[0] in ['aeiouAEIOU']: + hstr = "An " + else: + hstr = "A " + longname = hstr + name + + if sys.flags.optimize < 2: + # Skip adding docstrings if interpreter is run with -OO + if self.__doc__ is None: + self._construct_default_doc(longname=longname, + extradoc=extradoc) + else: + self._construct_doc() + + #discrete RV do not have the scale parameter, remove it + self.__doc__ = self.__doc__.replace( + '\n scale : array_like, ' + 'optional\n scale parameter (default=1)', '') + + def _construct_default_doc(self, longname=None, extradoc=None): + """Construct instance docstring from the rv_discrete template.""" + if extradoc is None: + extradoc = '' + if extradoc.startswith('\n\n'): + extradoc = extradoc[2:] + self.__doc__ = ''.join(['%s discrete random variable.' % longname, + '\n\n%(before_notes)s\n', docheaders['notes'], + extradoc, '\n%(example)s']) + self._construct_doc() + + def _construct_doc(self): + """Construct the instance docstring with string substitutions.""" + tempdict = docdict_discrete.copy() + tempdict['name'] = self.name or 'distname' + tempdict['shapes'] = self.shapes or '' + + if self.shapes is None: + # remove shapes from call parameters if there are none + for item in ['callparams', 'default', 'before_notes']: + tempdict[item] = tempdict[item].replace( + "\n%(shapes)s : array_like\n shape parameters", "") + for i in range(2): + if self.shapes is None: + # necessary because we use %(shapes)s in two forms (w w/o ", ") + self.__doc__ = self.__doc__.replace("%(shapes)s, ", "") + self.__doc__ = doccer.docformat(self.__doc__, tempdict) + + def _nonzero(self, k, *args): + return floor(k) == k + + def _pmf(self, k, *args): + return self._cdf(k, *args) - self._cdf(k-1, *args) + + def _logpmf(self, k, *args): + return log(self._pmf(k, *args)) + + def _cdf_single(self, k, *args): + m = arange(int(self.a), k+1) + return sum(self._pmf(m, *args), axis=0) + + def _cdf(self, x, *args): + k = floor(x) + return self._cdfvec(k, *args) + + # generic _logcdf, _sf, _logsf, _ppf, _isf, _rvs defined in rv_generic + + def rvs(self, *args, **kwargs): + """ + Random variates of given type. + + Parameters + ---------- + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + loc : array_like, optional + Location parameter (default=0). + size : int or tuple of ints, optional + Defining number of random variates (default=1). Note that `size` + has to be given as keyword, not as positional argument. + + Returns + ------- + rvs : ndarray or scalar + Random variates of given `size`. + + """ + kwargs['discrete'] = True + return super(rv_discrete, self).rvs(*args, **kwargs) + + def pmf(self, k, *args, **kwds): + """ + Probability mass function at k of the given RV. 
+ + Parameters + ---------- + k : array_like + quantiles + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information) + loc : array_like, optional + Location parameter (default=0). + + Returns + ------- + pmf : array_like + Probability mass function evaluated at k + + """ + args, loc, _ = self._parse_args(*args, **kwds) + k, loc = map(asarray, (k, loc)) + args = tuple(map(asarray, args)) + k = asarray((k-loc)) + cond0 = self._argcheck(*args) + cond1 = (k >= self.a) & (k <= self.b) & self._nonzero(k, *args) + cond = cond0 & cond1 + output = zeros(shape(cond), 'd') + place(output, (1-cond0) + np.isnan(k), self.badvalue) + if any(cond): + goodargs = argsreduce(cond, *((k,)+args)) + place(output, cond, self._pmf(*goodargs)) + if output.ndim == 0: + return output[()] + return output + + def logpmf(self, k, *args, **kwds): + """ + Log of the probability mass function at k of the given RV. + + Parameters + ---------- + k : array_like + Quantiles. + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + loc : array_like, optional + Location parameter. Default is 0. + + Returns + ------- + logpmf : array_like + Log of the probability mass function evaluated at k. + + """ + args, loc, _ = self._parse_args(*args, **kwds) + k, loc = map(asarray, (k, loc)) + args = tuple(map(asarray, args)) + k = asarray((k-loc)) + cond0 = self._argcheck(*args) + cond1 = (k >= self.a) & (k <= self.b) & self._nonzero(k, *args) + cond = cond0 & cond1 + output = empty(shape(cond), 'd') + output.fill(NINF) + place(output, (1-cond0) + np.isnan(k), self.badvalue) + if any(cond): + goodargs = argsreduce(cond, *((k,)+args)) + place(output, cond, self._logpmf(*goodargs)) + if output.ndim == 0: + return output[()] + return output + + def cdf(self, k, *args, **kwds): + """ + Cumulative distribution function of the given RV. + + Parameters + ---------- + k : array_like, int + Quantiles. + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + loc : array_like, optional + Location parameter (default=0). + + Returns + ------- + cdf : ndarray + Cumulative distribution function evaluated at `k`. + + """ + args, loc, _ = self._parse_args(*args, **kwds) + k, loc = map(asarray, (k, loc)) + args = tuple(map(asarray, args)) + k = asarray((k-loc)) + cond0 = self._argcheck(*args) + cond1 = (k >= self.a) & (k < self.b) + cond2 = (k >= self.b) + cond = cond0 & cond1 + output = zeros(shape(cond), 'd') + place(output, (1-cond0) + np.isnan(k), self.badvalue) + place(output, cond2*(cond0 == cond0), 1.0) + + if any(cond): + goodargs = argsreduce(cond, *((k,)+args)) + place(output, cond, self._cdf(*goodargs)) + if output.ndim == 0: + return output[()] + return output + + def logcdf(self, k, *args, **kwds): + """ + Log of the cumulative distribution function at k of the given RV + + Parameters + ---------- + k : array_like, int + Quantiles. + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + loc : array_like, optional + Location parameter (default=0). + + Returns + ------- + logcdf : array_like + Log of the cumulative distribution function evaluated at k. 
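+
+        As an illustrative consistency check (using `poisson` from
+        `scipy.stats`; for non-extreme arguments the result agrees with
+        taking the log of `cdf` directly):
+
+        >>> from scipy.stats import poisson
+        >>> np.allclose(poisson.logcdf(3, 3), np.log(poisson.cdf(3, 3)))
+        True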
+ + """ + args, loc, _ = self._parse_args(*args, **kwds) + k, loc = map(asarray, (k, loc)) + args = tuple(map(asarray, args)) + k = asarray((k-loc)) + cond0 = self._argcheck(*args) + cond1 = (k >= self.a) & (k < self.b) + cond2 = (k >= self.b) + cond = cond0 & cond1 + output = empty(shape(cond), 'd') + output.fill(NINF) + place(output, (1-cond0) + np.isnan(k), self.badvalue) + place(output, cond2*(cond0 == cond0), 0.0) + + if any(cond): + goodargs = argsreduce(cond, *((k,)+args)) + place(output, cond, self._logcdf(*goodargs)) + if output.ndim == 0: + return output[()] + return output + + def sf(self, k, *args, **kwds): + """ + Survival function (1-cdf) at k of the given RV. + + Parameters + ---------- + k : array_like + Quantiles. + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + loc : array_like, optional + Location parameter (default=0). + + Returns + ------- + sf : array_like + Survival function evaluated at k. + + """ + args, loc, _ = self._parse_args(*args, **kwds) + k, loc = map(asarray, (k, loc)) + args = tuple(map(asarray, args)) + k = asarray(k-loc) + cond0 = self._argcheck(*args) + cond1 = (k >= self.a) & (k <= self.b) + cond2 = (k < self.a) & cond0 + cond = cond0 & cond1 + output = zeros(shape(cond), 'd') + place(output, (1-cond0) + np.isnan(k), self.badvalue) + place(output, cond2, 1.0) + if any(cond): + goodargs = argsreduce(cond, *((k,)+args)) + place(output, cond, self._sf(*goodargs)) + if output.ndim == 0: + return output[()] + return output + + def logsf(self, k, *args, **kwds): + """ + Log of the survival function of the given RV. + + Returns the log of the "survival function," defined as ``1 - cdf``, + evaluated at `k`. + + Parameters + ---------- + k : array_like + Quantiles. + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + loc : array_like, optional + Location parameter (default=0). + + Returns + ------- + logsf : ndarray + Log of the survival function evaluated at `k`. + + """ + args, loc, _ = self._parse_args(*args, **kwds) + k, loc = map(asarray, (k, loc)) + args = tuple(map(asarray, args)) + k = asarray(k-loc) + cond0 = self._argcheck(*args) + cond1 = (k >= self.a) & (k <= self.b) + cond2 = (k < self.a) & cond0 + cond = cond0 & cond1 + output = empty(shape(cond), 'd') + output.fill(NINF) + place(output, (1-cond0) + np.isnan(k), self.badvalue) + place(output, cond2, 0.0) + if any(cond): + goodargs = argsreduce(cond, *((k,)+args)) + place(output, cond, self._logsf(*goodargs)) + if output.ndim == 0: + return output[()] + return output + + def ppf(self, q, *args, **kwds): + """ + Percent point function (inverse of cdf) at q of the given RV + + Parameters + ---------- + q : array_like + Lower tail probability. + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + loc : array_like, optional + Location parameter (default=0). + scale : array_like, optional + Scale parameter (default=1). + + Returns + ------- + k : array_like + Quantile corresponding to the lower tail probability, q. 
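For a discrete distribution the cdf is a step function (the running sum of the pmf) and `ppf` is its generalised inverse: the smallest integer `k` whose cdf reaches the requested probability. A quick sanity check, sketched with `scipy.stats.poisson`:

```python
import numpy as np
from scipy import stats

mu = 3.0
k = np.arange(10)
pmf = stats.poisson.pmf(k, mu)
cdf = stats.poisson.cdf(k, mu)
print(np.allclose(np.cumsum(pmf), cdf))   # True: the cdf is the running sum of the pmf

# ppf returns the smallest k with cdf(k) >= q, so it inverts the step-wise cdf
q = stats.poisson.cdf(4, mu)
print(stats.poisson.ppf(q, mu))           # 4.0
```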
+ + """ + args, loc, _ = self._parse_args(*args, **kwds) + q, loc = map(asarray, (q, loc)) + args = tuple(map(asarray, args)) + cond0 = self._argcheck(*args) & (loc == loc) + cond1 = (q > 0) & (q < 1) + cond2 = (q == 1) & cond0 + cond = cond0 & cond1 + output = valarray(shape(cond), value=self.badvalue, typecode='d') + # output type 'd' to handle nin and inf + place(output, (q == 0)*(cond == cond), self.a-1) + place(output, cond2, self.b) + if any(cond): + goodargs = argsreduce(cond, *((q,)+args+(loc,))) + loc, goodargs = goodargs[-1], goodargs[:-1] + place(output, cond, self._ppf(*goodargs) + loc) + + if output.ndim == 0: + return output[()] + return output + + def isf(self, q, *args, **kwds): + """ + Inverse survival function (1-sf) at q of the given RV. + + Parameters + ---------- + q : array_like + Upper tail probability. + arg1, arg2, arg3,... : array_like + The shape parameter(s) for the distribution (see docstring of the + instance object for more information). + loc : array_like, optional + Location parameter (default=0). + + Returns + ------- + k : ndarray or scalar + Quantile corresponding to the upper tail probability, q. + + """ + args, loc, _ = self._parse_args(*args, **kwds) + q, loc = map(asarray, (q, loc)) + args = tuple(map(asarray, args)) + cond0 = self._argcheck(*args) & (loc == loc) + cond1 = (q > 0) & (q < 1) + cond2 = (q == 1) & cond0 + cond = cond0 & cond1 + + # same problem as with ppf; copied from ppf and changed + output = valarray(shape(cond), value=self.badvalue, typecode='d') + # output type 'd' to handle nin and inf + place(output, (q == 0)*(cond == cond), self.b) + place(output, cond2, self.a-1) + + # call place only if at least 1 valid argument + if any(cond): + goodargs = argsreduce(cond, *((q,)+args+(loc,))) + loc, goodargs = goodargs[-1], goodargs[:-1] + # PB same as ticket 766 + place(output, cond, self._isf(*goodargs) + loc) + + if output.ndim == 0: + return output[()] + return output + + def _entropy(self, *args): + if hasattr(self, 'pk'): + return entropy(self.pk) + else: + mu = int(self.stats(*args, **{'moments': 'm'})) + val = self.pmf(mu, *args) + ent = -xlogy(val, val) + k = 1 + term = 1.0 + while (abs(term) > _EPS): + val = self.pmf(mu+k, *args) + term = -xlogy(val, val) + val = self.pmf(mu-k, *args) + term -= xlogy(val, val) + k += 1 + ent += term + return ent + + def expect(self, func=None, args=(), loc=0, lb=None, ub=None, + conditional=False): + """ + Calculate expected value of a function with respect to the distribution + for discrete distribution + + Parameters + ---------- + fn : function (default: identity mapping) + Function for which sum is calculated. Takes only one argument. + args : tuple + argument (parameters) of the distribution + lb, ub : numbers, optional + lower and upper bound for integration, default is set to the + support of the distribution, lb and ub are inclusive (ul<=k<=ub) + conditional : bool, optional + Default is False. + If true then the expectation is corrected by the conditional + probability of the integration interval. The return value is the + expectation of the function, conditional on being in the given + interval (k such that ul<=k<=ub). + + Returns + ------- + expect : float + Expected value. + + Notes + ----- + * function is not vectorized + * accuracy: uses self.moment_tol as stopping criterium + for heavy tailed distribution e.g. 
zipf(4), accuracy for + mean, variance in example is only 1e-5, + increasing precision (moment_tol) makes zipf very slow + * suppnmin=100 internal parameter for minimum number of points to + evaluate could be added as keyword parameter, to evaluate functions + with non-monotonic shapes, points include integers in (-suppnmin, + suppnmin) + * uses maxcount=1000 limits the number of points that are evaluated + to break loop for infinite sums + (a maximum of suppnmin+1000 positive plus suppnmin+1000 negative + integers are evaluated) + + """ + + # moment_tol = 1e-12 # increase compared to self.moment_tol, + # too slow for only small gain in precision for zipf + + # avoid endless loop with unbound integral, eg. var of zipf(2) + maxcount = 1000 + suppnmin = 100 # minimum number of points to evaluate (+ and -) + + if func is None: + def fun(x): + # loc and args from outer scope + return (x+loc)*self._pmf(x, *args) + else: + def fun(x): + # loc and args from outer scope + return func(x+loc)*self._pmf(x, *args) + # used pmf because _pmf does not check support in randint and there + # might be problems(?) with correct self.a, self.b at this stage maybe + # not anymore, seems to work now with _pmf + + self._argcheck(*args) # (re)generate scalar self.a and self.b + if lb is None: + lb = (self.a) + else: + lb = lb - loc # convert bound for standardized distribution + if ub is None: + ub = (self.b) + else: + ub = ub - loc # convert bound for standardized distribution + if conditional: + if np.isposinf(ub)[()]: + # work around bug: stats.poisson.sf(stats.poisson.b, 2) is nan + invfac = 1 - self.cdf(lb-1, *args) + else: + invfac = 1 - self.cdf(lb-1, *args) - self.sf(ub, *args) + else: + invfac = 1.0 + + tot = 0.0 + low, upp = self._ppf(0.001, *args), self._ppf(0.999, *args) + low = max(min(-suppnmin, low), lb) + upp = min(max(suppnmin, upp), ub) + supp = np.arange(low, upp+1, self.inc) # check limits + tot = np.sum(fun(supp)) + diff = 1e100 + pos = upp + self.inc + count = 0 + + # handle cases with infinite support + + while (pos <= ub) and (diff > self.moment_tol) and count <= maxcount: + diff = fun(pos) + tot += diff + pos += self.inc + count += 1 + + if self.a < 0: # handle case when self.a = -inf + diff = 1e100 + pos = low - self.inc + while ((pos >= lb) and (diff > self.moment_tol) and + count <= maxcount): + diff = fun(pos) + tot += diff + pos -= self.inc + count += 1 + if count > maxcount: + warnings.warn('expect(): sum did not converge', RuntimeWarning) + return tot/invfac diff --git a/pywafo/src/wafo/stats/_multivariate.py b/pywafo/src/wafo/stats/_multivariate.py new file mode 100644 index 0000000..83f0381 --- /dev/null +++ b/pywafo/src/wafo/stats/_multivariate.py @@ -0,0 +1,493 @@ +# +# Author: Joris Vankerschaver 2013 +# +from __future__ import division, print_function, absolute_import + +from scipy.misc import doccer +from functools import wraps +import numpy as np +import scipy.linalg + +__all__ = ['multivariate_normal'] + + +_LOG_2PI = np.log(2 * np.pi) + + +def _process_parameters(dim, mean, cov): + """ + Infer dimensionality from mean or covariance matrix, ensure that + mean and covariance are full vector resp. matrix. 
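The `expect` method above approximates E[f(X)] by summing f(k)·pmf(k) over the support, widening the range until the terms fall below `moment_tol` or `maxcount` is reached. A small check, run here through `scipy.stats.poisson.expect`, which uses the same kind of support summation:

```python
from scipy import stats

mu = 3.0
mean = stats.poisson.expect(lambda k: k, args=(mu,))            # E[X]
var = stats.poisson.expect(lambda k: (k - mu)**2, args=(mu,))   # Var[X]
print(mean, var)   # both should be close to 3.0 for a Poisson(3) variable
```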
+ + """ + + # Try to infer dimensionality + if dim is None: + if mean is None: + if cov is None: + dim = 1 + else: + cov = np.asarray(cov, dtype=float) + if cov.ndim < 2: + dim = 1 + else: + dim = cov.shape[0] + else: + mean = np.asarray(mean, dtype=float) + dim = mean.size + else: + if not np.isscalar(dim): + raise ValueError("Dimension of random variable must be a scalar.") + + # Check input sizes and return full arrays for mean and cov if necessary + if mean is None: + mean = np.zeros(dim) + mean = np.asarray(mean, dtype=float) + + if cov is None: + cov = 1.0 + cov = np.asarray(cov, dtype=float) + + if dim == 1: + mean.shape = (1,) + cov.shape = (1, 1) + + if mean.ndim != 1 or mean.shape[0] != dim: + raise ValueError("Array 'mean' must be vector of length %d." % dim) + if cov.ndim == 0: + cov = cov * np.eye(dim) + elif cov.ndim == 1: + cov = np.diag(cov) + else: + if cov.shape != (dim, dim): + raise ValueError("Array 'cov' must be at most two-dimensional," + " but cov.ndim = %d" % cov.ndim) + + return dim, mean, cov + + +def _process_quantiles(x, dim): + """ + Adjust quantiles array so that last axis labels the components of + each data point. + + """ + x = np.asarray(x, dtype=float) + + if x.ndim == 0: + x = x[np.newaxis] + elif x.ndim == 1: + if dim == 1: + x = x[:, np.newaxis] + else: + x = x[np.newaxis, :] + + return x + + +def _squeeze_output(out): + """ + Remove single-dimensional entries from array and convert to scalar, + if necessary. + + """ + out = out.squeeze() + if out.ndim == 0: + out = out[()] + return out + + +def _pinv_1d(v, eps=1e-5): + """ + A helper function for computing the pseudoinverse. + + Parameters + ---------- + v : iterable of numbers + This may be thought of as a vector of eigenvalues or singular values. + eps : float + Elements of v smaller than eps are considered negligible. + + Returns + ------- + v_pinv : 1d float ndarray + A vector of pseudo-inverted numbers. + + """ + return np.array([0 if abs(x) < eps else 1/x for x in v], dtype=float) + + +def _psd_pinv_decomposed_log_pdet(mat, cond=None, rcond=None, + lower=True, check_finite=True): + """ + Compute a decomposition of the pseudo-inverse and the logarithm of + the pseudo-determinant of a symmetric positive semi-definite + matrix. + + The pseudo-determinant of a matrix is defined as the product of + the non-zero eigenvalues, and coincides with the usual determinant + for a full matrix. + + Parameters + ---------- + mat : array_like + Input array of shape (`m`, `n`) + cond, rcond : float or None + Cutoff for 'small' singular values. + Eigenvalues smaller than ``rcond*largest_eigenvalue`` + are considered zero. + If None or -1, suitable machine precision is used. + lower : bool, optional + Whether the pertinent array data is taken from the lower or upper + triangle of `mat`. (Default: lower) + check_finite : boolean, optional + Whether to check that the input matrix contains only finite numbers. + Disabling may give a performance gain, but may result in problems + (crashes, non-termination) if the inputs do contain infinities or NaNs. + + Returns + ------- + M : array_like + The pseudo-inverse of the input matrix is np.dot(M, M.T). + log_pdet : float + Logarithm of the pseudo-determinant of the matrix. + + """ + # Compute the symmetric eigendecomposition. + # The input covariance matrix is required to be real symmetric + # and positive semidefinite which implies that its eigenvalues + # are all real and non-negative, + # but clip them anyway to avoid numerical issues. 
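To make the decomposition concrete, here is a self-contained numeric sketch of the same eigendecomposition-based pseudo-inverse and log pseudo-determinant, applied to a rank-deficient (but still positive semi-definite) covariance matrix:

```python
import numpy as np
import scipy.linalg

cov = np.array([[2.0, 0.0], [0.0, 0.0]])       # rank-deficient, still PSD
s, u = scipy.linalg.eigh(cov)

# 'small' eigenvalues are treated as zero, mirroring the double-precision cutoff
eps = 1e6 * np.finfo(float).eps * np.max(np.abs(s))
s_pinv = np.array([0.0 if abs(x) < eps else 1.0 / x for x in s])

U = u * np.sqrt(s_pinv)                        # np.dot(U, U.T) is the pseudo-inverse
log_pdet = np.sum(np.log(s[s > eps]))          # product of the non-zero eigenvalues

print(np.dot(U, U.T))                          # pinv(cov): [[0.5, 0.], [0., 0.]]
print(log_pdet)                                # log(2.0)
```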
+ + # TODO: the code to set cond/rcond is identical to that in + # scipy.linalg.{pinvh, pinv2} and if/when this function is subsumed + # into scipy.linalg it should probably be shared between all of + # these routines. + + # Note that eigh takes care of array conversion, chkfinite, + # and assertion that the matrix is square. + s, u = scipy.linalg.eigh(mat, lower=lower, check_finite=check_finite) + + if rcond is not None: + cond = rcond + if cond in [None, -1]: + t = u.dtype.char.lower() + factor = {'f': 1E3, 'd': 1E6} + cond = factor[t] * np.finfo(t).eps + eps = cond * np.max(abs(s)) + + if np.min(s) < -eps: + raise ValueError('the covariance matrix must be positive semidefinite') + + s_pinv = _pinv_1d(s, eps) + U = np.multiply(u, np.sqrt(s_pinv)) + log_pdet = np.sum(np.log(s[s > eps])) + + return U, log_pdet + + +_doc_default_callparams = \ +"""mean : array_like, optional + Mean of the distribution (default zero) +cov : array_like, optional + Covariance matrix of the distribution (default one) +""" + +_doc_callparams_note = \ +"""Setting the parameter `mean` to `None` is equivalent to having `mean` +be the zero-vector. The parameter `cov` can be a scalar, in which case +the covariance matrix is the identity times that value, a vector of +diagonal entries for the covariance matrix, or a two-dimensional +array_like. +""" + +_doc_frozen_callparams = "" + +_doc_frozen_callparams_note = \ +"""See class definition for a detailed description of parameters.""" + +docdict_params = { + '_doc_default_callparams': _doc_default_callparams, + '_doc_callparams_note': _doc_callparams_note +} + +docdict_noparams = { + '_doc_default_callparams': _doc_frozen_callparams, + '_doc_callparams_note': _doc_frozen_callparams_note +} + + +class multivariate_normal_gen(object): + r""" + A multivariate normal random variable. + + The `mean` keyword specifies the mean. The `cov` keyword specifies the + covariance matrix. + + .. versionadded:: 0.14.0 + + Methods + ------- + pdf(x, mean=None, cov=1) + Probability density function. + logpdf(x, mean=None, cov=1) + Log of the probability density function. + rvs(mean=None, cov=1) + Draw random samples from a multivariate normal distribution. + entropy() + Compute the differential entropy of the multivariate normal. + + Parameters + ---------- + x : array_like + Quantiles, with the last axis of `x` denoting the components. + %(_doc_default_callparams)s + + Alternatively, the object may be called (as a function) to fix the mean + and covariance parameters, returning a "frozen" multivariate normal + random variable: + + rv = multivariate_normal(mean=None, scale=1) + - Frozen object with the same methods but holding the given + mean and covariance fixed. + + Notes + ----- + %(_doc_callparams_note)s + + The covariance matrix `cov` must be a (symmetric) positive + semi-definite matrix. The determinant and inverse of `cov` are computed + as the pseudo-determinant and pseudo-inverse, respectively, so + that `cov` does not need to have full rank. + + The probability density function for `multivariate_normal` is + + .. math:: + + f(x) = \frac{1}{\sqrt{(2 \pi)^k \det \Sigma}} \exp\left( -\frac{1}{2} (x - \mu)^T \Sigma^{-1} (x - \mu) \right), + + where :math:`\mu` is the mean, :math:`\Sigma` the covariance matrix, + and :math:`k` is the dimension of the space where :math:`x` takes values. 
+ + Examples + -------- + >>> from scipy.stats import multivariate_normal + >>> x = np.linspace(0, 5, 10, endpoint=False) + >>> y = multivariate_normal.pdf(x, mean=2.5, cov=0.5); y + array([ 0.00108914, 0.01033349, 0.05946514, 0.20755375, 0.43939129, + 0.56418958, 0.43939129, 0.20755375, 0.05946514, 0.01033349]) + >>> plt.plot(x, y) + + The input quantiles can be any shape of array, as long as the last + axis labels the components. This allows us for instance to + display the frozen pdf for a non-isotropic random variable in 2D as + follows: + + >>> x, y = np.mgrid[-1:1:.01, -1:1:.01] + >>> pos = np.empty(x.shape + (2,)) + >>> pos[:, :, 0] = x; pos[:, :, 1] = y + >>> rv = multivariate_normal([0.5, -0.2], [[2.0, 0.3], [0.3, 0.5]]) + >>> plt.contourf(x, y, rv.pdf(pos)) + + """ + + def __init__(self): + self.__doc__ = doccer.docformat(self.__doc__, docdict_params) + + def __call__(self, mean=None, cov=1): + """ + Create a frozen multivariate normal distribution. + + See `multivariate_normal_frozen` for more information. + + """ + return multivariate_normal_frozen(mean, cov) + + def _logpdf(self, x, mean, prec_U, log_det_cov): + """ + Parameters + ---------- + x : ndarray + Points at which to evaluate the log of the probability + density function + mean : ndarray + Mean of the distribution + prec_U : ndarray + A decomposition such that np.dot(prec_U, prec_U.T) + is the precision matrix, i.e. inverse of the covariance matrix. + log_det_cov : float + Logarithm of the determinant of the covariance matrix + + Notes + ----- + As this function does no argument checking, it should not be + called directly; use 'logpdf' instead. + + """ + dim = x.shape[-1] + dev = x - mean + maha = np.sum(np.square(np.dot(dev, prec_U)), axis=-1) + return -0.5 * (dim * _LOG_2PI + log_det_cov + maha) + + def logpdf(self, x, mean, cov): + """ + Log of the multivariate normal probability density function. + + Parameters + ---------- + x : array_like + Quantiles, with the last axis of `x` denoting the components. + %(_doc_default_callparams)s + + Notes + ----- + %(_doc_callparams_note)s + + Returns + ------- + pdf : ndarray + Log of the probability density function evaluated at `x` + + """ + dim, mean, cov = _process_parameters(None, mean, cov) + x = _process_quantiles(x, dim) + prec_U, log_det_cov = _psd_pinv_decomposed_log_pdet(cov) + out = self._logpdf(x, mean, prec_U, log_det_cov) + return _squeeze_output(out) + + def pdf(self, x, mean, cov): + """ + Multivariate normal probability density function. + + Parameters + ---------- + x : array_like + Quantiles, with the last axis of `x` denoting the components. + %(_doc_default_callparams)s + + Notes + ----- + %(_doc_callparams_note)s + + Returns + ------- + pdf : ndarray + Probability density function evaluated at `x` + + """ + dim, mean, cov = _process_parameters(None, mean, cov) + x = _process_quantiles(x, dim) + prec_U, log_det_cov = _psd_pinv_decomposed_log_pdet(cov) + out = np.exp(self._logpdf(x, mean, prec_U, log_det_cov)) + return _squeeze_output(out) + + def rvs(self, mean=None, cov=1, size=1): + """ + Draw random samples from a multivariate normal distribution. + + Parameters + ---------- + %(_doc_default_callparams)s + size : integer, optional + Number of samples to draw (default 1). + + Notes + ----- + %(_doc_callparams_note)s + + Returns + ------- + rvs : ndarray or scalar + Random variates of size (`size`, `N`), where `N` is the + dimension of the random variable. 
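As a cross-check of `_logpdf` above, the result can be compared against the closed-form density from the Notes section. A short sketch for a 2-D case:

```python
import numpy as np
from scipy.stats import multivariate_normal

mean = np.array([0.5, -0.2])
cov = np.array([[2.0, 0.3], [0.3, 0.5]])
x = np.array([0.0, 0.0])

# closed-form density: exp(-0.5 (x-mu)' Sigma^{-1} (x-mu)) / sqrt((2 pi)^k det Sigma)
manual = np.exp(-0.5 * (x - mean) @ np.linalg.inv(cov) @ (x - mean)) \
         / np.sqrt((2 * np.pi)**2 * np.linalg.det(cov))

print(multivariate_normal.pdf(x, mean=mean, cov=cov), manual)  # the two values should agree
```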
+ + """ + dim, mean, cov = _process_parameters(None, mean, cov) + out = np.random.multivariate_normal(mean, cov, size) + return _squeeze_output(out) + + def entropy(self, mean=None, cov=1): + """ + Compute the differential entropy of the multivariate normal. + + Parameters + ---------- + %(_doc_default_callparams)s + + Notes + ----- + %(_doc_callparams_note)s + + Returns + ------- + h : scalar + Entropy of the multivariate normal distribution + + """ + dim, mean, cov = _process_parameters(None, mean, cov) + return 1/2 * np.log(np.linalg.det(2 * np.pi * np.e * cov)) + +multivariate_normal = multivariate_normal_gen() + + +class multivariate_normal_frozen(object): + def __init__(self, mean=None, cov=1): + """ + Create a frozen multivariate normal distribution. + + Parameters + ---------- + mean : array_like, optional + Mean of the distribution (default zero) + cov : array_like, optional + Covariance matrix of the distribution (default one) + + Examples + -------- + When called with the default parameters, this will create a 1D random + variable with mean 0 and covariance 1: + + >>> from scipy.stats import multivariate_normal + >>> r = multivariate_normal() + >>> r.mean + array([ 0.]) + >>> r.cov + array([[1.]]) + + """ + self.dim, self.mean, self.cov = _process_parameters(None, mean, cov) + self.prec_U, self._log_det_cov = _psd_pinv_decomposed_log_pdet(self.cov) + + self._mnorm = multivariate_normal_gen() + + def logpdf(self, x): + x = _process_quantiles(x, self.dim) + out = self._mnorm._logpdf(x, self.mean, self.prec_U, self._log_det_cov) + return _squeeze_output(out) + + def pdf(self, x): + return np.exp(self.logpdf(x)) + + def rvs(self, size=1): + return self._mnorm.rvs(self.mean, self.cov, size) + + def entropy(self): + """ + Computes the differential entropy of the multivariate normal. + + Returns + ------- + h : scalar + Entropy of the multivariate normal distribution + + """ + return 1/2 * (self.dim * (_LOG_2PI + 1) + self._log_det_cov) + + +# Set frozen generator docstrings from corresponding docstrings in +# multivariate_normal_gen and fill in default strings in class docstrings +for name in ['logpdf', 'pdf', 'rvs']: + method = multivariate_normal_gen.__dict__[name] + method_frozen = multivariate_normal_frozen.__dict__[name] + method_frozen.__doc__ = doccer.docformat(method.__doc__, docdict_noparams) + method.__doc__ = doccer.docformat(method.__doc__, docdict_params) diff --git a/pywafo/src/wafo/stats/_tukeylambda_stats.py b/pywafo/src/wafo/stats/_tukeylambda_stats.py new file mode 100644 index 0000000..2681814 --- /dev/null +++ b/pywafo/src/wafo/stats/_tukeylambda_stats.py @@ -0,0 +1,201 @@ +from __future__ import division, print_function, absolute_import + +import numpy as np +from numpy import poly1d +from scipy.special import beta + + +# The following code was used to generate the Pade coefficients for the +# Tukey Lambda variance function. Version 0.17 of mpmath was used. +#--------------------------------------------------------------------------- +# import mpmath as mp +# +# mp.mp.dps = 60 +# +# one = mp.mpf(1) +# two = mp.mpf(2) +# +# def mpvar(lam): +# if lam == 0: +# v = mp.pi**2 / three +# else: +# v = (two / lam**2) * (one / (one + two*lam) - +# mp.beta(lam + one, lam + one)) +# return v +# +# t = mp.taylor(mpvar, 0, 8) +# p, q = mp.pade(t, 4, 4) +# print "p =", [mp.fp.mpf(c) for c in p] +# print "q =", [mp.fp.mpf(c) for c in q] +#--------------------------------------------------------------------------- + +# Pade coefficients for the Tukey Lambda variance function. 
+_tukeylambda_var_pc = [3.289868133696453, 0.7306125098871127, + -0.5370742306855439, 0.17292046290190008, + -0.02371146284628187] +_tukeylambda_var_qc = [1.0, 3.683605511659861, 4.184152498888124, + 1.7660926747377275, 0.2643989311168465] + +# numpy.poly1d instances for the numerator and denominator of the +# Pade approximation to the Tukey Lambda variance. +_tukeylambda_var_p = poly1d(_tukeylambda_var_pc[::-1]) +_tukeylambda_var_q = poly1d(_tukeylambda_var_qc[::-1]) + + +def tukeylambda_variance(lam): + """Variance of the Tukey Lambda distribution. + + Parameters + ---------- + lam : array_like + The lambda values at which to compute the variance. + + Returns + ------- + v : ndarray + The variance. For lam < -0.5, the variance is not defined, so + np.nan is returned. For lam = 0.5, np.inf is returned. + + Notes + ----- + In an interval around lambda=0, this function uses the [4,4] Pade + approximation to compute the variance. Otherwise it uses the standard + formula (http://en.wikipedia.org/wiki/Tukey_lambda_distribution). The + Pade approximation is used because the standard formula has a removable + discontinuity at lambda = 0, and does not produce accurate numerical + results near lambda = 0. + """ + lam = np.asarray(lam) + shp = lam.shape + lam = np.atleast_1d(lam).astype(np.float64) + + # For absolute values of lam less than threshold, use the Pade + # approximation. + threshold = 0.075 + + # Play games with masks to implement the conditional evaluation of + # the distribution. + # lambda < -0.5: var = nan + low_mask = lam < -0.5 + # lambda == -0.5: var = inf + neghalf_mask = lam == -0.5 + # abs(lambda) < threshold: use Pade approximation + small_mask = np.abs(lam) < threshold + # else the "regular" case: use the explicit formula. + reg_mask = ~(low_mask | neghalf_mask | small_mask) + + # Get the 'lam' values for the cases where they are needed. + small = lam[small_mask] + reg = lam[reg_mask] + + # Compute the function for each case. + v = np.empty_like(lam) + v[low_mask] = np.nan + v[neghalf_mask] = np.inf + if small.size > 0: + # Use the Pade approximation near lambda = 0. + v[small_mask] = _tukeylambda_var_p(small) / _tukeylambda_var_q(small) + if reg.size > 0: + v[reg_mask] = (2.0 / reg**2) * (1.0 / (1.0 + 2 * reg) - + beta(reg + 1, reg + 1)) + v.shape = shp + return v + + +# The following code was used to generate the Pade coefficients for the +# Tukey Lambda kurtosis function. Version 0.17 of mpmath was used. +#--------------------------------------------------------------------------- +# import mpmath as mp +# +# mp.mp.dps = 60 +# +# one = mp.mpf(1) +# two = mp.mpf(2) +# three = mp.mpf(3) +# four = mp.mpf(4) +# +# def mpkurt(lam): +# if lam == 0: +# k = mp.mpf(6)/5 +# else: +# numer = (one/(four*lam+one) - four*mp.beta(three*lam+one, lam+one) + +# three*mp.beta(two*lam+one, two*lam+one)) +# denom = two*(one/(two*lam+one) - mp.beta(lam+one,lam+one))**2 +# k = numer / denom - three +# return k +# +# # There is a bug in mpmath 0.17: when we use the 'method' keyword of the +# # taylor function and we request a degree 9 Taylor polynomial, we actually +# # get degree 8. +# t = mp.taylor(mpkurt, 0, 9, method='quad', radius=0.01) +# t = [mp.chop(c, tol=1e-15) for c in t] +# p, q = mp.pade(t, 4, 4) +# print "p =", [mp.fp.mpf(c) for c in p] +# print "q =", [mp.fp.mpf(c) for c in q] +#--------------------------------------------------------------------------- + +# Pade coefficients for the Tukey Lambda kurtosis function. 
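The Pade branch in `tukeylambda_variance` above exists because the explicit formula is 0/0 at lambda = 0, where the distribution reduces to the logistic with variance pi**2/3. A quick check, run via `scipy.stats.tukeylambda`, which relies on the same helper:

```python
import numpy as np
from scipy.stats import tukeylambda

print(tukeylambda.var(0.0))    # ~3.2899, computed through the Pade approximation
print(np.pi ** 2 / 3.0)        # the exact logistic variance for comparison
```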
+_tukeylambda_kurt_pc = [1.2, -5.853465139719495, -22.653447381131077, + 0.20601184383406815, 4.59796302262789] +_tukeylambda_kurt_qc = [1.0, 7.171149192233599, 12.96663094361842, + 0.43075235247853005, -2.789746758009912] + +# numpy.poly1d instances for the numerator and denominator of the +# Pade approximation to the Tukey Lambda kurtosis. +_tukeylambda_kurt_p = poly1d(_tukeylambda_kurt_pc[::-1]) +_tukeylambda_kurt_q = poly1d(_tukeylambda_kurt_qc[::-1]) + + +def tukeylambda_kurtosis(lam): + """Kurtosis of the Tukey Lambda distribution. + + Parameters + ---------- + lam : array_like + The lambda values at which to compute the variance. + + Returns + ------- + v : ndarray + The variance. For lam < -0.25, the variance is not defined, so + np.nan is returned. For lam = 0.25, np.inf is returned. + + """ + lam = np.asarray(lam) + shp = lam.shape + lam = np.atleast_1d(lam).astype(np.float64) + + # For absolute values of lam less than threshold, use the Pade + # approximation. + threshold = 0.055 + + # Use masks to implement the conditional evaluation of the kurtosis. + # lambda < -0.25: kurtosis = nan + low_mask = lam < -0.25 + # lambda == -0.25: kurtosis = inf + negqrtr_mask = lam == -0.25 + # lambda near 0: use Pade approximation + small_mask = np.abs(lam) < threshold + # else the "regular" case: use the explicit formula. + reg_mask = ~(low_mask | negqrtr_mask | small_mask) + + # Get the 'lam' values for the cases where they are needed. + small = lam[small_mask] + reg = lam[reg_mask] + + # Compute the function for each case. + k = np.empty_like(lam) + k[low_mask] = np.nan + k[negqrtr_mask] = np.inf + if small.size > 0: + k[small_mask] = _tukeylambda_kurt_p(small) / _tukeylambda_kurt_q(small) + if reg.size > 0: + numer = (1.0 / (4 * reg + 1) - 4 * beta(3 * reg + 1, reg + 1) + + 3 * beta(2 * reg + 1, 2 * reg + 1)) + denom = 2 * (1.0/(2 * reg + 1) - beta(reg + 1, reg + 1))**2 + k[reg_mask] = numer / denom - 3 + + # The return value will be a numpy array; resetting the shape ensures that + # if `lam` was a scalar, the return value is a 0-d array. + k.shape = shp + return k diff --git a/pywafo/src/wafo/stats/contingency.py b/pywafo/src/wafo/stats/contingency.py new file mode 100644 index 0000000..226a5b1 --- /dev/null +++ b/pywafo/src/wafo/stats/contingency.py @@ -0,0 +1,271 @@ +"""Some functions for working with contingency tables (i.e. cross tabulations). +""" + + +from __future__ import division, print_function, absolute_import + +from functools import reduce +import numpy as np +from .stats import power_divergence + + +__all__ = ['margins', 'expected_freq', 'chi2_contingency'] + + +def margins(a): + """Return a list of the marginal sums of the array `a`. + + Parameters + ---------- + a : ndarray + The array for which to compute the marginal sums. + + Returns + ------- + margsums : list of ndarrays + A list of length `a.ndim`. `margsums[k]` is the result + of summing `a` over all axes except `k`; it has the same + number of dimensions as `a`, but the length of each axis + except axis `k` will be 1. 
+ + Examples + -------- + >>> a = np.arange(12).reshape(2, 6) + >>> a + array([[ 0, 1, 2, 3, 4, 5], + [ 6, 7, 8, 9, 10, 11]]) + >>> m0, m1 = margins(a) + >>> m0 + array([[15], + [51]]) + >>> m1 + array([[ 6, 8, 10, 12, 14, 16]]) + + >>> b = np.arange(24).reshape(2,3,4) + >>> m0, m1, m2 = margins(b) + >>> m0 + array([[[ 66]], + [[210]]]) + >>> m1 + array([[[ 60], + [ 92], + [124]]]) + >>> m2 + array([[[60, 66, 72, 78]]]) + """ + margsums = [] + ranged = list(range(a.ndim)) + for k in ranged: + marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k]) + margsums.append(marg) + return margsums + + +def expected_freq(observed): + """ + Compute the expected frequencies from a contingency table. + + Given an n-dimensional contingency table of observed frequencies, + compute the expected frequencies for the table based on the marginal + sums under the assumption that the groups associated with each + dimension are independent. + + Parameters + ---------- + observed : array_like + The table of observed frequencies. (While this function can handle + a 1-D array, that case is trivial. Generally `observed` is at + least 2-D.) + + Returns + ------- + expected : ndarray of float64 + The expected frequencies, based on the marginal sums of the table. + Same shape as `observed`. + + Examples + -------- + >>> observed = np.array([[10, 10, 20],[20, 20, 20]]) + >>> expected_freq(observed) + array([[ 12., 12., 16.], + [ 18., 18., 24.]]) + + """ + # Typically `observed` is an integer array. If `observed` has a large + # number of dimensions or holds large values, some of the following + # computations may overflow, so we first switch to floating point. + observed = np.asarray(observed, dtype=np.float64) + + # Create a list of the marginal sums. + margsums = margins(observed) + + # Create the array of expected frequencies. The shapes of the + # marginal sums returned by apply_over_axes() are just what we + # need for broadcasting in the following product. + d = observed.ndim + expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1) + return expected + + +def chi2_contingency(observed, correction=True, lambda_=None): + """Chi-square test of independence of variables in a contingency table. + + This function computes the chi-square statistic and p-value for the + hypothesis test of independence of the observed frequencies in the + contingency table [1]_ `observed`. The expected frequencies are computed + based on the marginal sums under the assumption of independence; see + `scipy.stats.contingency.expected_freq`. The number of degrees of + freedom is (expressed using numpy functions and attributes):: + + dof = observed.size - sum(observed.shape) + observed.ndim - 1 + + + Parameters + ---------- + observed : array_like + The contingency table. The table contains the observed frequencies + (i.e. number of occurrences) in each category. In the two-dimensional + case, the table is often described as an "R x C table". + correction : bool, optional + If True, *and* the degrees of freedom is 1, apply Yates' correction + for continuity. The effect of the correction is to adjust each + observed value by 0.5 towards the corresponding expected value. + lambda_ : float or str, optional. + By default, the statistic computed in this test is Pearson's + chi-squared statistic [2]_. `lambda_` allows a statistic from the + Cressie-Read power divergence family [3]_ to be used instead. See + `power_divergence` for details. + + Returns + ------- + chi2 : float + The test statistic. 
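A quick worked check of the degrees-of-freedom formula quoted above: for a 2 x 3 table it reduces to the familiar (R-1)(C-1).

```python
import numpy as np

obs = np.ones((2, 3))                                   # any 2 x 3 table has the same dof
dof = obs.size - sum(obs.shape) + obs.ndim - 1
print(dof)                                              # 2, i.e. (2-1)*(3-1)
```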
+ p : float + The p-value of the test + dof : int + Degrees of freedom + expected : ndarray, same shape as `observed` + The expected frequencies, based on the marginal sums of the table. + + See Also + -------- + contingency.expected_freq + fisher_exact + chisquare + power_divergence + + Notes + ----- + An often quoted guideline for the validity of this calculation is that + the test should be used only if the observed and expected frequency in + each cell is at least 5. + + This is a test for the independence of different categories of a + population. The test is only meaningful when the dimension of + `observed` is two or more. Applying the test to a one-dimensional + table will always result in `expected` equal to `observed` and a + chi-square statistic equal to 0. + + This function does not handle masked arrays, because the calculation + does not make sense with missing values. + + Like stats.chisquare, this function computes a chi-square statistic; + the convenience this function provides is to figure out the expected + frequencies and degrees of freedom from the given contingency table. + If these were already known, and if the Yates' correction was not + required, one could use stats.chisquare. That is, if one calls:: + + chi2, p, dof, ex = chi2_contingency(obs, correction=False) + + then the following is true:: + + (chi2, p) == stats.chisquare(obs.ravel(), f_exp=ex.ravel(), + ddof=obs.size - 1 - dof) + + The `lambda_` argument was added in version 0.13.0 of scipy. + + References + ---------- + .. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table + .. [2] "Pearson's chi-squared test", + http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test + .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit + Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984), + pp. 440-464. + + Examples + -------- + A two-way example (2 x 3): + + >>> obs = np.array([[10, 10, 20], [20, 20, 20]]) + >>> chi2_contingency(obs) + (2.7777777777777777, + 0.24935220877729619, + 2, + array([[ 12., 12., 16.], + [ 18., 18., 24.]])) + + Perform the test using the log-likelihood ratio (i.e. the "G-test") + instead of Pearson's chi-squared statistic. + + >>> g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood") + >>> g, p + (2.7688587616781319, 0.25046668010954165) + + A four-way example (2 x 2 x 2 x 2): + + >>> obs = np.array( + ... [[[[12, 17], + ... [11, 16]], + ... [[11, 12], + ... [15, 16]]], + ... [[[23, 15], + ... [30, 22]], + ... [[14, 17], + ... [15, 16]]]]) + >>> chi2_contingency(obs) + (8.7584514426741897, + 0.64417725029295503, + 11, + array([[[[ 14.15462386, 14.15462386], + [ 16.49423111, 16.49423111]], + [[ 11.2461395 , 11.2461395 ], + [ 13.10500554, 13.10500554]]], + [[[ 19.5591166 , 19.5591166 ], + [ 22.79202844, 22.79202844]], + [[ 15.54012004, 15.54012004], + [ 18.10873492, 18.10873492]]]])) + """ + observed = np.asarray(observed) + if np.any(observed < 0): + raise ValueError("All values in `observed` must be nonnegative.") + if observed.size == 0: + raise ValueError("No data; `observed` has size 0.") + + expected = expected_freq(observed) + if np.any(expected == 0): + # Include one of the positions where expected is zero in + # the exception message. + zeropos = list(np.where(expected == 0)[0]) + raise ValueError("The internally computed table of expected " + "frequencies has a zero element at %s." 
% zeropos) + + # The degrees of freedom + dof = expected.size - sum(expected.shape) + expected.ndim - 1 + + if dof == 0: + # Degenerate case; this occurs when `observed` is 1D (or, more + # generally, when it has only one nontrivial dimension). In this + # case, we also have observed == expected, so chi2 is 0. + chi2 = 0.0 + p = 1.0 + else: + if dof == 1 and correction: + # Adjust `observed` according to Yates' correction for continuity. + observed = observed + 0.5 * np.sign(expected - observed) + + chi2, p = power_divergence(observed, expected, + ddof=observed.size - 1 - dof, axis=None, + lambda_=lambda_) + + return chi2, p, dof, expected diff --git a/pywafo/src/wafo/stats/core.py b/pywafo/src/wafo/stats/core.py index 2818695..86a325a 100644 --- a/pywafo/src/wafo/stats/core.py +++ b/pywafo/src/wafo/stats/core.py @@ -1,1371 +1,1426 @@ -from __future__ import division -import warnings -from wafo.wafodata import PlotData -from wafo.misc import findextrema -from scipy import special -import numpy as np -from numpy import inf -from numpy import atleast_1d, nan, ndarray, sqrt, vstack, ones, where, zeros -from numpy import arange, floor, linspace, asarray #, reshape, repeat, product -from time import gmtime, strftime - - -__all__ = ['edf', 'edfcnd','reslife', 'dispersion_idx','decluster','findpot', - 'declustering_time','interexceedance_times', 'extremal_idx'] - -arr = asarray - -def now(): - ''' - Return current date and time as a string - ''' - return strftime("%a, %d %b %Y %H:%M:%S", gmtime()) - -def valarray(shape, value=nan, typecode=None): - """Return an array of all value. - """ - #out = reshape(repeat([value], product(shape, axis=0), axis=0), shape) - out = ones(shape, dtype=bool) * value - if typecode is not None: - out = out.astype(typecode) - if not isinstance(out, ndarray): - out = arr(out) - return out -def _cdff(self, x, dfn, dfd): - return special.fdtr(dfn, dfd, x) -def _cdft(x,df): - return special.stdtr(df, x) -def _invt(q, df): - return special.stdtrit(df, q) -def _cdfchi2(x, df): - return special.chdtr(df, x) -def _invchi2(q, df): - return special.chdtri(df, q) -def _cdfnorm(x): - return special.ndtr(x) -def _invnorm(q): - return special.ndtri(q) - -def edf(x, method=2): - ''' - Returns Empirical Distribution Function (EDF). - - Parameters - ---------- - x : array-like - data vector - method : integer scalar - 1. Interpolation so that F(X_(k)) == (k-0.5)/n. - 2. Interpolation so that F(X_(k)) == k/(n+1). (default) - 3. The empirical distribution. F(X_(k)) = k/n - - Example - ------- - >>> import wafo.stats as ws - >>> x = np.linspace(0,6,200) - >>> R = ws.rayleigh.rvs(scale=2,size=100) - >>> F = ws.edf(R) - >>> h = F.plot() - - See also edf, pdfplot, cumtrapz - ''' - z = atleast_1d(x) - z.sort() - - N = len(z) - if method == 1: - Fz1 = arange(0.5, N) / N - elif method == 3: - Fz1 = arange(1, N + 1) / N - else: - Fz1 = arange(1, N + 1) / (N + 1) - - F = PlotData(Fz1, z, xlab='x', ylab='F(x)') - F.setplotter('step') - return F - -def edfcnd(x, c=None, method=2): - ''' - Returns empirical Distribution Function CoNDitioned that X>=c (EDFCND). - - Parameters - ---------- - x : array-like - data vector - method : integer scalar - 1. Interpolation so that F(X_(k)) == (k-0.5)/n. - 2. Interpolation so that F(X_(k)) == k/(n+1). (default) - 3. The empirical distribution. 
F(X_(k)) = k/n - - Example - ------- - >>> import wafo.stats as ws - >>> x = np.linspace(0,6,200) - >>> R = ws.rayleigh.rvs(scale=2,size=100) - >>> Fc = ws.edfcnd(R, 1) - >>> hc = Fc.plot() - >>> F = ws.edf(R) - >>> h = F.plot() - - See also edf, pdfplot, cumtrapz - ''' - z = atleast_1d(x) - if c is None: - c = floor(min(z.min(), 0)) - - try: - F = edf(z[c <= z], method=method) - except: - ValueError('No data points above c=%d' % int(c)) - - if - inf < c: - F.labels.ylab = 'F(x| X>=%g)' % c - - return F - - -def reslife(data, u=None, umin=None, umax=None, nu=None, nmin=3, alpha=0.05, plotflag=False): - ''' - Return Mean Residual Life, i.e., mean excesses vs thresholds - - Parameters - --------- - data : array_like - vector of data of length N. - u : array-like - threshold values (default linspace(umin, umax, nu)) - umin, umax : real scalars - Minimum and maximum threshold, respectively (default min(data), max(data)). - nu : scalar integer - number of threshold values (default min(N-nmin,100)) - nmin : scalar integer - Minimum number of extremes to include. (Default 3). - alpha : real scalar - Confidence coefficient (default 0.05) - plotflag: bool - - - Returns - ------- - mrl : PlotData object - Mean residual life values, i.e., mean excesses over thresholds, u. - - Notes - ----- - RESLIFE estimate mean excesses over thresholds. The purpose of MRL is - to determine the threshold where the upper tail of the data can be - approximated with the generalized Pareto distribution (GPD). The GPD is - appropriate for the tail, if the MRL is a linear function of the - threshold, u. Theoretically in the GPD model - - E(X-u0|X>u0) = s0/(1+k) - E(X-u |X>u) = s/(1+k) = (s0 -k*u)/(1+k) for u>u0 - - where k,s is the shape and scale parameter, respectively. - s0 = scale parameter for threshold u0>> import wafo - >>> R = wafo.stats.genpareto.rvs(0.1,2,2,size=100) - >>> mrl = reslife(R,nu=20) - >>> h = mrl.plot() - - See also - --------- - genpareto - fitgenparrange, disprsnidx - ''' - if u is None: - sd = np.sort(data) - n = len(data) - - nmin = max(nmin, 0) - if 2 * nmin > n: - warnings.warn('nmin possibly too large!') - - sdmax, sdmin = sd[-nmin], sd[0] - umax = sdmax if umax is None else min(umax, sdmax) - umin = sdmin if umin is None else max(umin, sdmin) - - if nu is None: - nu = min(n - nmin, 100) - - u = linspace(umin, umax, nu) - - - nu = len(u) - - #mrl1 = valarray(nu) - #srl = valarray(nu) - #num = valarray(nu) - - mean_and_std = lambda data1 : (data1.mean(), data1.std(), data1.size) - dat = arr(data) - tmp = arr([mean_and_std(dat[dat > tresh] - tresh) for tresh in u.tolist()]) - - mrl, srl, num = tmp.T - p = 1 - alpha - alpha2 = alpha / 2 - - # Approximate P% confidence interval - #%Za = -invnorm(alpha2); % known mean - Za = -_invt(alpha2, num - 1) # unknown mean - mrlu = mrl + Za * srl / sqrt(num) - mrll = mrl - Za * srl / sqrt(num) - - #options.CI = [mrll,mrlu]; - #options.numdata = num; - titleTxt = 'Mean residual life with %d%s CI' % (100 * p, '%') - res = PlotData(mrl, u, xlab='Threshold', ylab='Mean Excess', title=titleTxt) - res.workspace = dict(numdata=num, umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha) - res.children = [PlotData(vstack([mrll, mrlu]).T, u, xlab='Threshold', title=titleTxt)] - res.plot_args_children = [':r'] - if plotflag: - res.plot() - return res - -def dispersion_idx(data, t=None, u=None, umin=None, umax=None, nu=None, nmin=10, tb=1, - alpha=0.05, plotflag=False): - '''Return Dispersion Index vs threshold - - Parameters - ---------- - data, ti : array_like - data 
values and sampled times, respectively. - u : array-like - threshold values (default linspace(umin, umax, nu)) - umin, umax : real scalars - Minimum and maximum threshold, respectively (default min(data), max(data)). - nu : scalar integer - number of threshold values (default min(N-nmin,100)) - nmin : scalar integer - Minimum number of extremes to include. (Default 10). - tb : Real scalar - Block period (same unit as the sampled times) (default 1) - alpha : real scalar - Confidence coefficient (default 0.05) - plotflag: bool - - Returns - ------- - DI : PlotData object - Dispersion index - b_u : real scalar - threshold where the number of exceedances in a fixed period (Tb) is - consistent with a Poisson process. - ok_u : array-like - all thresholds where the number of exceedances in a fixed period (Tb) is - consistent with a Poisson process. - Notes - ------ - DISPRSNIDX estimate the Dispersion Index (DI) as function of threshold. - DI measures the homogenity of data and the purpose of DI is to determine - the threshold where the number of exceedances in a fixed period (Tb) is - consistent with a Poisson process. For a Poisson process the DI is one. - Thus the threshold should be so high that DI is not significantly - different from 1. - - The Poisson hypothesis is not rejected if the estimated DI is between: - - chi2(alpha/2, M-1)/(M-1)< DI < chi^2(1 - alpha/2, M-1 }/(M - 1) - - where M is the total number of fixed periods/blocks -generally - the total number of years in the sample. - - Example - ------- - >>> import wafo.data - >>> xn = wafo.data.sea() - >>> t, data = xn.T - >>> Ie = findpot(data,t,0,5); - >>> di, u, ok_u = dispersion_idx(data[Ie],t[Ie],tb=100) - >>> h = di.plot() # a threshold around 1 seems appropriate. - >>> round(u*100)/100 - 1.03 - - vline(u) - - See also - -------- - reslife, - fitgenparrange, - extremal_idx - - - References - ---------- - Ribatet, M. A.,(2006), - A User's Guide to the POT Package (Version 1.0) - month = {August}, - url = {http://cran.r-project.org/} - - Cunnane, C. (1979) Note on the poisson assumption in - partial duration series model. Water Resource Research, 15\bold{(2)} - :489--494.} - ''' - -# This program is free software; you can redistribute it and/or modify it under the terms of the GNU -# General Public License as published by the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# This program is distributed in the hope that it will be useful, but without any warranty; without even -# the implied warranty of merchantability or fitness for a particular purpose. See the GNU General Public -# License for moredetails. -# The GNU General Public License can be obtained from http://www.gnu.org/copyleft/gpl.html. You -# can also obtain it by writing to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -# MA 02111-1307, USA. - - - - n = len(data) - if t is None: - ti = arange(n) - else: - ti = arr(t) - min(t) - - t1 = np.empty(ti.shape,dtype=int) - t1[:] = np.floor(ti / tb) - - - if u is None: - sd = np.sort(data) - - - nmin = max(nmin, 0) - if 2 * nmin > n: - warnings.warn('nmin possibly too large!') - - sdmax, sdmin = sd[-nmin], sd[0] - umax = sdmax if umax is None else min(umax, sdmax) - umin = sdmin if umin is None else max(umin, sdmin) - - if nu is None: - nu = min(n - nmin, 100) - - u = linspace(umin, umax, nu) - - - - nu = len(u) - - di = np.zeros(nu) - - d = arr(data) - - mint = int(min(t1)) #; % mint should be 0. 
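To make the Poisson-consistency band concrete, here is a short sketch of the chi-square bounds on the dispersion index, computed with the same `scipy.special.chdtri` call used in the code (M = 50 blocks and alpha = 0.05 are illustrative values only):

```python
from scipy import special

M, alpha = 50, 0.05
di_lo = special.chdtri(M - 1, 1 - alpha / 2) / (M - 1)   # chi2 lower quantile / (M-1)
di_up = special.chdtri(M - 1, alpha / 2) / (M - 1)       # chi2 upper quantile / (M-1)
print(di_lo, di_up)   # roughly (0.64, 1.43); a DI inside this band is consistent with a Poisson process
```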
- maxt = int(max(t1)) - M = maxt - mint + 1; - occ = np.zeros(M); - - for ix, tresh in enumerate(u.tolist()): - excess = (d > tresh) - lambda_ = excess.sum() / M - for block in range(M): - occ[block] = sum(excess[t1 == block]) - - di[ix] = occ.var() / lambda_ - - p = 1 - alpha - - diLo = _invchi2(1 - alpha / 2, M - 1) / (M - 1) - diUp = _invchi2(alpha / 2, M - 1) / (M - 1) - - # Find appropriate threshold - k1, = np.where((diLo < di) & (di < diUp)) - if len(k1) > 0: - ok_u = u[k1] - b_di = (di[k1].mean() < di[k1]) - k = b_di.argmax() - b_u = ok_u[k] - else: - b_u = ok_u = None - - CItxt = '%d%s CI' % (100 * p, '%') - titleTxt = 'Dispersion Index plot'; - - res = PlotData(di, u, title=titleTxt, labx='Threshold', laby='Dispersion Index') - #'caption',CItxt); - res.workspace = dict(umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha) - res.children = [PlotData(vstack([diLo * ones(nu), diUp * ones(nu)]).T, u, xlab='Threshold', title=CItxt)] - res.plot_args_children = ['--r'] - if plotflag: - res.plot(di) - return res, b_u, ok_u - -def decluster(data, t=None, thresh=None, tmin=1): - ''' - Return declustered peaks over threshold values - - Parameters - ---------- - data, t : array-like - data-values and sampling-times, respectively. - thresh : real scalar - minimum threshold for levels in data. - tmin : real scalar - minimum distance to another peak [same unit as t] (default 1) - - Returns - ------- - ev, te : ndarray - extreme values and its corresponding sampling times, respectively, i.e., - all data > thresh which are at least tmin distance apart. - - Example - ------- - >>> import pylab - >>> import wafo.data - >>> from wafo.misc import findtc - >>> x = wafo.data.sea() - >>> t, data = x[:400,:].T - >>> itc, iv = findtc(data,0,'dw') - >>> ytc, ttc = data[itc], t[itc] - >>> ymin = 2*data.std() - >>> tmin = 10 # sec - >>> [ye, te] = decluster(ytc,ttc, ymin,tmin); - >>> h = pylab.plot(t,data,ttc,ytc,'ro',t,zeros(len(t)),':',te,ye,'k.') - - See also - -------- - fitgenpar, findpot, extremalidx - ''' - if t is None: - t = np.arange(len(data)) - i = findpot(data, t, thresh, tmin) - return data[i], t[i] - -def findpot(data, t=None, thresh=None, tmin=1): - ''' - Retrun indices to Peaks over threshold values - - Parameters - ---------- - data, t : array-like - data-values and sampling-times, respectively. - thresh : real scalar - minimum threshold for levels in data. - tmin : real scalar - minimum distance to another peak [same unit as t] (default 1) - - Returns - ------- - Ie : ndarray - indices to extreme values, i.e., all data > tresh which are at least - tmin distance apart. 
- - Example - ------- - >>> import pylab - >>> import wafo.data - >>> from wafo.misc import findtc - >>> x = wafo.data.sea() - >>> t, data = x.T - >>> itc, iv = findtc(data,0,'dw') - >>> ytc, ttc = data[itc], t[itc] - >>> ymin = 2*data.std() - >>> tmin = 10 # sec - >>> I = findpot(data, t, ymin, tmin) - >>> yp, tp = data[I], t[I] - >>> Ie = findpot(yp, tp, ymin,tmin) - >>> ye, te = yp[Ie], tp[Ie] - >>> h = pylab.plot(t,data,ttc,ytc,'ro',t,zeros(len(t)),':',te, ye,'k.',tp,yp,'+') - - See also - -------- - fitgenpar, decluster, extremalidx - ''' - Data = arr(data) - if t is None: - ti = np.arange(len(Data)) - else: - ti = arr(t) - - Ie, = where(Data > thresh); - Ye = Data[Ie] - Te = ti[Ie] - if len(Ye) <= 1: - return Ie - - dT = np.diff(Te) - notSorted = np.any(dT < 0); - if notSorted: - I = np.argsort(Te) - Te = Te[I] - Ie = Ie[I] - Ye = Ye[I] - dT = np.diff(Te) - - isTooSmall = (dT <= tmin) - - if np.any(isTooSmall): - isTooClose = np.hstack((isTooSmall[0], isTooSmall[:-1] | isTooSmall[1:], isTooSmall[-1])) - - #Find opening (NO) and closing (NC) index for data beeing to close: - iy = findextrema(np.hstack([0, 0, isTooSmall, 0])) - - NO = iy[::2] - 1 - NC = iy[1::2] - - for no, nc in zip(NO, NC): - iz = slice(no, nc) - iOK = _find_ok_peaks(Ye[iz], Te[iz], tmin) - if len(iOK): - isTooClose[no + iOK] = 0 - # Remove data which is too close to other data. - if isTooClose.any(): - #len(tooClose)>0: - iOK, = where(1 - isTooClose) - Ie = Ie[iOK] - - return Ie - - -def _find_ok_peaks(Ye, Te, Tmin): - ''' - Return indices to the largest maxima that are at least Tmin - distance apart. - ''' - Ny = len(Ye) - - I = np.argsort(-Ye) # sort in descending order - - Te1 = Te[I] - oOrder = zeros(Ny, dtype=int) - oOrder[I] = range(Ny) #indices to the variables original location - - isTooClose = zeros(Ny, dtype=bool) - - pool = zeros((Ny, 2)) - T_range = np.hstack([-Tmin, Tmin]) - K = 0 - for i, ti in enumerate(Te1): - isTooClose[i] = np.any((pool[:K, 0] <= ti) & (ti <= pool[:K, 1])) - if not isTooClose[i]: - pool[K] = ti + T_range - K += 1 - - iOK, = where(1 - isTooClose[oOrder]) - return iOK - -def declustering_time(t): - ''' - Returns minimum distance between clusters. - - Parameters - ---------- - t : array-like - sampling times for data. - - Returns - ------- - tc : real scalar - minimum distance between clusters. - - Example - ------- - >>> import wafo.data - >>> x = wafo.data.sea() - >>> t, data = x[:400,:].T - >>> Ie = findpot(data,t,0,5); - >>> tc = declustering_time(Ie) - >>> tc - 21 - - ''' - t0 = arr(t) - nt = len(t0) - if nt<2: - return arr([]) - ti = interexceedance_times(t0) - ei = extremal_idx(ti) - if ei==1: - tc = ti.min() - else: - i = int(np.floor(nt*ei)) - sti = -np.sort(-ti) - tc = sti[min(i, nt-2)] #% declustering time - return tc - - -def interexceedance_times(t): - ''' - Returns interexceedance times of data - - Parameters - ---------- - t : array-like - sampling times for data. - Returns - ------- - ti : ndarray - interexceedance times - - Example - ------- - >>> t = [1,2,5,10] - >>> interexceedance_times(t) - array([1, 3, 5]) - - ''' - return np.diff(np.sort(t)) - -def extremal_idx(ti): - ''' - Returns Extremal Index measuring the dependence of data - - Parameters - ---------- - ti : array-like - interexceedance times for data. - - Returns - ------- - ei : real scalar - Extremal index. - - Notes - ----- - The Extremal Index (EI) is one if the data are independent and less than - one if there are some dependence. 
The extremal index can also be intepreted - as the reciprocal of the mean cluster size. - - Example - ------- - >>> import wafo.data - >>> x = wafo.data.sea() - >>> t, data = x[:400,:].T - >>> Ie = findpot(data,t,0,5); - >>> ti = interexceedance_times(Ie) - >>> ei = extremal_idx(ti) - >>> ei - 1 - - See also - -------- - reslife, fitgenparrange, disprsnidx, findpot, decluster - - - Reference - --------- - Christopher A. T. Ferro, Johan Segers (2003) - Inference for clusters of extreme values - Journal of the Royal Statistical society: Series B (Statistical Methodology) 54 (2), 545-556 - doi:10.1111/1467-9868.00401 - ''' - t = arr(ti) - tmax = t.max() - if tmax<=1: - ei = 0 - elif tmax<=2: - ei = min(1, 2*t.mean()**2/((t**2).mean())) - else: - ei = min(1, 2*np.mean(t-1)**2/np.mean((t-1)*(t-2))) - return ei - -def _logit(p): - return np.log(p)-np.log1p(-p) -def _logitinv(x): - return 1.0/(np.exp(-x)+1) - -class RegLogit(object): - ''' - REGLOGIT Fit ordinal logistic regression model. - - CALL model = reglogit (options) - - model = fitted model object with methods - .compare() : Compare small LOGIT object versus large one - .predict() : Predict from a fitted LOGIT object - .summary() : Display summary of fitted LOGIT object. - - y = vector of K ordered categories - x = column vectors of covariates - options = struct defining performance of REGLOGIT - .maxiter : maximum number of iterations. - .accuracy : accuracy in convergence. - .betastart : Start value for BETA (default 0) - .thetastart : Start value for THETA (default depends on Y) - .alpha : Confidence coefficent (default 0.05) - .verbose : 1 display summary info about fitted model - 2 display convergence info in each iteration - otherwise no action - .deletecolinear : If true delete colinear covarites (default) - - - Methods - .predict : Predict from a fitted LOGIT object - .summary : Display summary of fitted LOGIT object. - .compare : Compare small LOGIT versus large one - - - - - Suppose Y takes values in K ordered categories, and let - gamma_i (x) be the cumulative probability that Y - falls in one of the first i categories given the covariate - X. The ordinal logistic regression model is - - logit (mu_i (x)) = theta_i + beta' * x, i = 1...k-1 - - The number of ordinal categories, K, is taken to be the number - of distinct values of round (Y). If K equals 2, - Y is binary and the model is ordinary logistic regression. The - matrix X is assumed to have full column rank. - - Given Y only, theta = REGLOGIT(Y) fits the model with baseline logit odds - only. - - Example - y=[1 1 2 1 3 2 3 2 3 3]' - x = (1:10)' - b = reglogit(y,x) - b.display() % members and methods - b.get() % return members - b.summary() - [mu,plo,pup] = b.predict(); - plot(x,mu,'g',x,plo,'r:',x,pup,'r:') - - y2 = [zeros(5,1);ones(5,1)]; - x1 = [29,30,31,31,32,29,30,31,32,33]; - x2 = [62,83,74,88,68,41,44,21,50,33]; - X = [x1;x2].'; - b2 = reglogit(y2,X); - b2.summary(); - b21 = reglogit(y2,X(:,1)); - b21.compare(b2) - - See also regglm, reglm, regnonlm - ''' - - #% Original for MATLAB written by Gordon K Smyth , - #% U of Queensland, Australia, on Nov 19, 1990. Last revision Aug 3, - #% 1992. - # - #% Author: Gordon K Smyth , - #% Revised by: pab - #% -renamed from oridinal to reglogit - #% -added predict, summary and compare - #% Description: Ordinal logistic regression - # - #% Uses the auxiliary functions logistic_regression_derivatives and - #% logistic_regression_likelihood. 
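A numeric sketch of the Ferro & Segers estimator implemented in `extremal_idx` above, applied to made-up interexceedance times with visible clustering (all values are illustrative only):

```python
import numpy as np

ti = np.array([1, 1, 1, 20, 1, 1, 25, 1, 1, 30], dtype=float)   # hypothetical interexceedance times
if ti.max() <= 1:
    ei = 0.0
elif ti.max() <= 2:
    ei = min(1, 2 * ti.mean()**2 / (ti**2).mean())
else:
    ei = min(1, 2 * np.mean(ti - 1)**2 / np.mean((ti - 1) * (ti - 2)))
print(ei)   # about 0.61: clustered exceedances, mean cluster size roughly 1/ei
```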
- - def __init__(self, maxiter=500,accuracy=1e-6, alpha=0.05, deletecolinear=True, verbose=False): - - self.maxiter =maxiter - self.accuracy = accuracy - self.alpha = alpha - self.deletecolinear = deletecolinear - self.verbose = False - self.family = None - self.link = None - self.numvar = None - self.numobs = None - self.numk = None - self.df = None - self.df_null = None - self.params = None - self.params_ci = None - self.params_cov = None - self.params_std = None - self.params_corr = None - self.params_tstat = None - self.params_pvalue = None - self.mu = None - self.eta = None - self.X = None - self.Y = None - self.theta = None - self.beta = None - self.residual = None - self.residual1d = None - self.deviance = None - self.deviance_null = None - self.d2L = None - self.dL = None - self.dispersionfit = None - self.dispersion = 1 - self.R2 = None - self.R2adj = None - self.numiter = None - self.converged = None - self.note = '' - self.date = now() - - def check_xy(self, y, X): - y = np.round(np.atleast_2d(y)) - my = y.shape[0] - if X is None: - X = np.zeros((my, 0)) - elif self.deletecolinear: - X = np.atleast_2d(X) - # Make sure X is full rank - s = np.linalg.svd(X)[1] - tol = max(X.shape) * np.finfo(s.max()).eps - ix = np.flatnonzero(s>tol) - iy = np.flatnonzero(s<=tol) - if len(ix): - X = X[:, ix] - txt = [' %d,' % i for i in iy] - #txt[-1] = ' %d' % iy[-1] - warnings.warn('Covariate matrix is singular. Removing column(s):%s' % txt) - mx = X.shape[0] - if (mx != my): - raise ValueError('x and y must have the same number of observations'); - return y, X - - - def fit(self, y, X=None, theta0=None, beta0=None): - ''' - Member variables - .df : degrees of freedom for error. - .params : estimated model parameters - .params_ci : 100(1-alpha)% confidence interval for model parameters - .params_tstat : t statistics for model's estimated parameters. - .params_pvalue: p value for model's estimated parameters. - .params_std : standard errors for estimated parameters - .params_corr : correlation matrix for estimated parameters. - .mu : fitted values for the model. - .eta : linear predictor for the model. - .residual : residual for the model (Y-E(Y|X)). - .dispersnfit : The estimated error variance - .deviance : deviance for the model equal minus twice the log-likelihood. - .d2L : Hessian matrix (double derivative of log-likelihood) - .dL : First derivative of loglikelihood w.r.t. THETA and BETA. 
- - ''' - self.family = 'multinomial'; - self.link = 'logit'; - y, X = self.check_xy(y, X) - - - # initial calculations - tol = self.accuracy - incr = 10 - decr = 2 - ymin = y.min() - ymax = y.max() - yrange = ymax - ymin - z = (y * ones((1, yrange))) == ((y * 0 + 1) * np.arange(ymin, ymax)) - z1 = (y * ones((1, yrange))) == ((y * 0 + 1) * np.arange(ymin + 1, ymax+1)) - z = z[:, np.flatnonzero(z.any(axis=0))]; - z1 = z1[:, np.flatnonzero(z1.any(axis=0))] - [_mz, nz] = z.shape - [_mx, nx] = X.shape - [my, _ny] = y.shape - - g = (z.sum(axis=0).cumsum() / my).reshape(-1,1) - theta00 = np.log(g / (1 - g)).ravel() - beta00 = np.zeros((nx,)) - # starting values - if theta0 is None: - theta0 = theta00 - - if beta0 is None: - beta0 = beta00 - - tb = np.hstack((theta0, beta0)) - - # likelihood and derivatives at starting values - [dev, dl, d2l] = self.loglike(tb, y, X, z, z1) - - epsilon = np.std(d2l) / 1000; - if np.any(beta0) or np.any(theta00!=theta0): - tb0 = np.vstack((theta00,beta00)) - nulldev = self.loglike (tb0, y, X, z, z1)[0] - else: - nulldev = dev - - # maximize likelihood using Levenberg modified Newton's method - for i in range(self.maxiter+1): - - tbold = tb; - devold = dev; - tb = tbold - np.linalg.lstsq(d2l, dl)[0] - [dev, dl, d2l] = self.loglike(tb, y, X, z, z1) - if ((dev - devold) / np.dot(dl, tb - tbold) < 0): - epsilon = epsilon / decr - else: - while ((dev - devold) / np.dot(dl, tb - tbold) > 0): - epsilon = epsilon * incr; - if (epsilon > 1e+15): - raise ValueError('epsilon too large'); - - tb = tbold - np.linalg.lstsq(d2l - epsilon * np.eye(d2l.shape), dl) - [dev, dl, d2l] = self.loglike(tb, y, X, z, z1); - print('epsilon %g' % epsilon) - #end %while - #end else - #[dl, d2l] = logistic_regression_derivatives (X, z, z1, g, g1, p); - if (self.verbose>1): - - print('Iter: %d, Deviance: %8.6f',iter,dev) - print('First derivative'); - print(dl); - print('Eigenvalues of second derivative'); - print(np.linalg.eig(d2l)[0].T); - #end - #end - stop = np.abs(np.dot(dl, np.linalg.lstsq(d2l, dl)[0]) / len(dl)) <= tol - if stop: - break - #end %while - - #% tidy up output - - theta = tb[:nz,] - beta = tb[nz:(nz + nx)] - pcov = np.linalg.pinv(-d2l) - se = sqrt(np.diag(pcov)) - - - if (nx > 0): - eta = ((X * beta) * ones((1, nz))) + ((y * 0 + 1) * theta) - else: - eta = (y * 0 + 1) * theta; - #end - gammai = np.diff(np.hstack(((y * 0), _logitinv(eta), (y * 0 + 1))),n=1,axis=1) - k0 = min(y) - mu = (k0-1)+np.dot(gammai,np.arange(1, nz+2)).reshape(-1,1) - r = np.corrcoef(np.hstack((y,mu)).T) - R2 = r[0,1]**2; #coefficient of determination - R2adj = max(1 - (1-R2)* (my-1)/(my-nx-nz-1),0); # adjusted coefficient of determination - - res = y-mu - - if nz==1: - self.family = 'binomial'; - else: - self.family = 'multinomial'; - - self.link = 'logit'; - - self.numvar = nx+nz - self.numobs = my - self.numk = nz+1 - self.df = max(my-nx-nz,0) - self.df_null = my-nz; #nulldf; nulldf = n - nz; - self.params = tb[:(nz + nx)] - self.params_ci = 1 - self.params_std = se - self.params_cov = pcov - self.params_tstat = (self.params/self.params_std) - if False: # % options.estdispersn %dispersion_parameter=='mean_deviance' - self.params_pvalue=2.*_cdft(-abs(self.params_tstat),self.df) - bcrit = -se*_invt(self.alpha/2,self.df); - else: - self.params_pvalue=2.*_cdfnorm(-abs(self.params_tstat)) - bcrit = -se*_invnorm(self.alpha/2); - #end - self.params_ci = np.vstack((self.params+bcrit,self.params-bcrit)) - - self.mu = gammai; - self.eta = _logit(gammai); - self.X = X; - [dev, dl, d2l, p] = self.loglike(tb, y, 
X, z, z1,numout=4) - self.theta = theta; - self.beta = beta; - self.gamma = gammai; - self.residual = res.T; - self.residualD = np.sign(self.residual)*sqrt(-2*np.log(p)); - self.deviance = dev; - self.deviance_null = nulldev; - self.d2L = d2l; - self.dL = dl.T; - self.dispersionfit=1; - self.dispersion = 1; - self.R2 = R2; - self.R2adj = R2adj; - self.numiter = i - self.converged = iobject2.numvar: - devL = self.deviance; - nL = self.numvar; - dfL = self.df; - Al = self.X; - disprsn = self.dispersionfit; - devs = object2.deviance; - ns = object2.numvar; - dfs = object2.df; - As = object2.X; - else: - devL = object2.deviance; - nL = object2.numvar; - dfL = object2.df; - Al = object2.X; - disprsn = object2.dispersionfit; - devs = self.deviance; - ns = self.numvar; - dfs = self.df; - As = self.X; - #end - - if (((As-np.dot(Al*np.linalg.lstsq(Al,As)))>500*np.finfo(float).eps).any() or - object2.family!=self.family or object2.link!=self.link): - warnings.warn('Small model not included in large model, result is rubbish!') - - - except: - raise ValueError('Apparently not a valid regression object') - - - - pmq = np.abs(nL-ns); - print(' ') - print(' Analysis of Deviance') - if False: # %options.estdispersn - localstat = abs(devL-devs)/disprsn/pmq; -# localpvalue = 1-cdff(localstat, pmq, dfL) -# print('Model DF Residual deviance F-stat Pr(>F)') - else: - localstat = abs(devL-devs)/disprsn; - localpvalue = 1-_cdfchi2(localstat,pmq) - print('Model DF Residual deviance Chi2-stat Pr(>Chi2)') - #end - - - print('Small %d %12.4f %12.4f %12.4f' % (dfs,devs,localstat,localpvalue)) - print('Full %d %12.4f' % (dfL,devL)) - print(' ') - - return localpvalue - - def anode(self): - print(' ') - print(' Analysis of Deviance') - if False: # %options.estdispersn - localstat = abs(self.deviance_null-self.deviance)/self.dispersionfit/(self.numvar-1); - localpvalue = 1-_cdff(localstat,self.numvar-1,self.df); - print('Model DF Residual deviance F-stat Pr(>F)') - else: - localstat = abs(self.deviance_null-self.deviance)/self.dispersionfit; - localpvalue = 1-_cdfchi2(localstat,self.numvar-1); - print('Model DF Residual deviance Chi2-stat Pr(>Chi2)') - #end - - - print('Null %d %12.4f %12.4f %12.4f' % (self.df_null,self.deviance_null,localstat,localpvalue)) - print('Full %d %12.4f' % (self.df,self.deviance)) - print(' ') - - print(' R2 = %2.4f, R2adj = %2.4f' % (self.R2,self.R2adj)) - print(' ') - return localpvalue - def summary(self): - txtlink = self.link; - - print('Call:') - print('reglogit(formula = %s(Pr(grp(y)<=i)) ~ theta_i+beta*x, family = %s)' %(txtlink,self.family)) - print(' ') - print('Deviance Residuals:') - m,q1,me, q3,M = np.percentile(self.residualD,q=[0, 25, 50, 75, 100]) - print(' Min 1Q Median 3Q Max ') - print('%2.4f %2.4f %2.4f %2.4f %2.4f' % (m, q1, me, q3, M)) - print(' ') - print(' Coefficients:') - if False: # %options.estdispersn - print(' Estimate Std. Error t value Pr(>|t|)') - else: - print(' Estimate Std. 
Error z value Pr(>|z|)') - #end - e, s, z, p = self.params, self.params_std, self.params_tstat, self.params_pvalue - for i in range(self.numk): - print('theta_%d %2.4f %2.4f %2.4f %2.4f' % (i,e[i],s[i], z[i], p[i])) - - for i in range(self.numk, self.numvar): - print(' beta_%d %2.4f %2.4f %2.4f %2.4f\n' % (i-self.numk,e[i],s[i], z[i], p[i])) - - print(' ') - print('(Dispersion parameter for %s family taken to be %2.2f)' % (self.family,self.dispersionfit)) - print(' ') - if True: #%options.constant - print(' Null deviance: %2.4f on %d degrees of freedom' % (self.deviance_null,self.df_null)) - #end - print('Residual deviance: %2.4f on %d degrees of freedom' % (self.deviance,self.df)) - - self.anode() - - #end % summary - - def predict(self, Xnew=None,alpha=0.05, fulloutput=False): - '''LOGIT/PREDICT Predict from a fitted LOGIT object - - CALL [y,ylo,yup] = predict(Xnew,options) - - y = predicted value - ylo,yup = 100(1-alpha)% confidence interval for y - - Xnew = new covariate - options = options struct defining the calculation - .alpha : confidence coefficient (default 0.05) - .size : size if binomial family (default 1). - ''' - - [_mx, nx] = self.X.shape - if Xnew is None: - Xnew = self.X; - else: - Xnew = np.atleast_2d(Xnew) - notnans = np.flatnonzero(1-(1-np.isfinite(Xnew)).any(axis=1)) - Xnew = Xnew[notnans,:] - - [n,p] = Xnew.shape - - - if p != nx: - raise ValueError('Number of covariates must match the number of regression coefficients') - - - nz = self.numk-1; - one = ones((n,1)) - if (nx > 0): - eta = np.dot(Xnew, self.beta).reshape(-1,1) + self.theta - else: - eta = one * self.theta - #end - y = np.diff(np.hstack((zeros((n,1)), _logitinv(eta), one)),n=1, axis=1) - if fulloutput: - eps = np.finfo(float).eps - pcov = self.params_cov; - if (nx > 0): - np1 = pcov.shape[0] - - [U, S, V]= np.linalg.svd(pcov,0); - R = np.dot(U,np.dot(np.diag(sqrt(S)),V)) #%squareroot of pcov - ib = np.r_[0,nz:np1] - - #% Var(eta_i) = var(theta_i+Xnew*b) - vareta = zeros((n,nz)); - u = np.hstack((one,Xnew)) - for i in range(nz): - ib[0] = i - vareta[:,i] = np.maximum(((np.dot(u,R[ib][:,ib]))**2).sum(axis=1),eps) - #end - else: - vareta = np.diag(pcov) - #end - crit = -_invnorm(alpha/2); - - - ecrit = crit * sqrt(vareta); - mulo = _logitinv(eta-ecrit); - muup = _logitinv(eta+ecrit); - ylo1 = np.diff(np.hstack((zeros((n,1)), mulo , one)),n=1,axis=1) - yup1 = np.diff(np.hstack((zeros((n,1)), muup , one)),n=1,axis=1) - - ylo = np.minimum(ylo1,yup1) - yup = np.maximum(ylo1,yup1) - - for i in range(1, nz): #= 2:self.numk-1 - yup[:,i] = np.vstack((yup[:,i],muup[:,i]-mulo[:,i-1])).max(axis=0) - #end - return y,ylo,yup - return y - - def loglike(self, beta, y, x, z, z1, numout=3): - ''' - [dev, dl, d2l, p] = loglike( y ,x,beta,z,z1) - Calculates likelihood for the ordinal logistic regression model. - ''' - # Author: Gordon K. Smyth - zx = np.hstack((z, x)) - z1x = np.hstack((z1, x)) - g = _logitinv(np.dot(zx, beta)).reshape((-1,1)) - g1 = _logitinv(np.dot(z1x, beta)).reshape((-1,1)) - g = np.maximum(y == y.max(), g) - g1 = np.minimum(y > y.min(), g1) - - - p = g - g1 - dev = -2 * np.log(p).sum() - - '''[dl, d2l] = derivatives of loglike(beta, y, x, z, z1) - % Called by logistic_regression. Calculates derivates of the - % log-likelihood for ordinal logistic regression model. - ''' - # Author: Gordon K. 
Smyth - # Description: Derivates of log-likelihood in logistic regression - - - # first derivative - v = g * (1 - g) / p; - v1 = g1 * (1 - g1) / p; - dlogp = np.hstack((((v*z) - (v1*z1)), ((v - v1)*x))) - dl = np.sum(dlogp, axis=0) - - # second derivative - w = v * (1 - 2 * g) - w1 = v1 * (1 - 2 * g1) - d2l = np.dot(zx.T, (w*zx)) - np.dot(z1x.T, (w1*z1x)) - np.dot(dlogp.T, dlogp) - - if numout==4: - return dev, dl, d2l, p - else: - return dev, dl, d2l - #end %function - - -def _test_dispersion_idx(): - import wafo.data - xn = wafo.data.sea() - t, data = xn.T - Ie = findpot(data,t,0,5); - di, _u, _ok_u = dispersion_idx(data[Ie],t[Ie],tb=100) - di.plot() # a threshold around 1 seems appropriate. - di.show() - pass - -def _test_findpot(): - import pylab - import wafo.data - from wafo.misc import findtc - x = wafo.data.sea() - t, data = x[:, :].T - itc, _iv = findtc(data, 0, 'dw') - ytc, ttc = data[itc], t[itc] - ymin = 2 * data.std() - tmin = 10 # sec - I = findpot(data, t, ymin, tmin) - yp, tp = data[I], t[I] - Ie = findpot(yp, tp, ymin, tmin) - ye, te = yp[Ie], tp[Ie] - pylab.plot(t, data, ttc,ytc,'ro', t, zeros(len(t)), ':', te, ye, 'kx', tp, yp, '+') - pylab.show() # - pass - -def _test_reslife(): - import wafo - R = wafo.stats.genpareto.rvs(0.1, 2, 2, size=100) - mrl = reslife(R, nu=20) - mrl.plot() - -def test_reglogit(): - y=np.array([1, 1, 2, 1, 3, 2, 3, 2, 3, 3]).reshape(-1,1) - x = np.arange(1,11).reshape(-1,1) - b = RegLogit() - b.fit(y,x) - #b.display() #% members and methods - - b.summary() - [mu,plo,pup] = b.predict(fulloutput=True) #@UnusedVariable - pass - #plot(x,mu,'g',x,plo,'r:',x,pup,'r:') -def test_reglogit2(): - n = 40 - x = np.sort(5*np.random.rand(n, 1)-2.5, axis=0) - y = (np.cos(x)>2*np.random.rand(n,1)-1) - b = RegLogit() - b.fit(y,x) - #b.display() #% members and methods - b.summary() - [mu,plo,pup] = b.predict(fulloutput=True); - import matplotlib.pyplot as pl - pl.plot(x,mu,'g',x,plo,'r:',x,pup,'r:') - pl.show() - -def test_sklearn0(): - from sklearn.linear_model import LogisticRegression - from sklearn import datasets #@UnusedImport - - # FIXME: the iris dataset has only 4 features! 
-# iris = datasets.load_iris() -# X = iris.data -# y = iris.target - - X = np.sort(5*np.random.rand(40, 1)-2.5, axis=0) - y = (2*(np.cos(X)>2*np.random.rand(40, 1)-1)-1).ravel() - - score = [] - # Set regularization parameter - cvals = np.logspace(-1,1,5) - for C in cvals: - clf_LR = LogisticRegression(C=C, penalty='l2') - clf_LR.fit(X, y) - score.append(clf_LR.score(X,y)) - - #plot(cvals, score) - -def test_sklearn(): - X = np.sort(5*np.random.rand(40, 1)-2.5, axis=0) - y = (2*(np.cos(X)>2*np.random.rand(40, 1)-1)-1).ravel() - from sklearn.svm import SVR - - - - ############################################################################### - # look at the results - import pylab as pl - pl.scatter(X, .5*np.cos(X)+0.5, c='k', label='True model') - pl.hold('on') - cvals= np.logspace(-1,3,20) - score = [] - for c in cvals: - svr_rbf = SVR(kernel='rbf', C=c, gamma=0.1, probability=True) - svrf = svr_rbf.fit(X, y) - y_rbf = svrf.predict(X) - score.append(svrf.score(X,y)) - pl.plot(X, y_rbf, label='RBF model c=%g' % c) - pl.xlabel('data') - pl.ylabel('target') - pl.title('Support Vector Regression') - pl.legend() - pl.show() - -def test_sklearn1(): - X = np.sort(5*np.random.rand(40, 1)-2.5, axis=0) - y = (2*(np.cos(X)>2*np.random.rand(40, 1)-1)-1).ravel() - from sklearn.svm import SVR - -# cvals= np.logspace(-1,4,10) - svr_rbf = SVR(kernel='rbf', C=1e4, gamma=0.1, probability=True) - svr_lin = SVR(kernel='linear', C=1e4, probability=True) - svr_poly = SVR(kernel='poly', C=1e4, degree=2, probability=True) - y_rbf = svr_rbf.fit(X, y).predict(X) - y_lin = svr_lin.fit(X, y).predict(X) - y_poly = svr_poly.fit(X, y).predict(X) - - ############################################################################### - # look at the results - import pylab as pl - pl.scatter(X, .5*np.cos(X)+0.5, c='k', label='True model') - pl.hold('on') - pl.plot(X, y_rbf, c='g', label='RBF model') - pl.plot(X, y_lin, c='r', label='Linear model') - pl.plot(X, y_poly, c='b', label='Polynomial model') - pl.xlabel('data') - pl.ylabel('target') - pl.title('Support Vector Regression') - pl.legend() - pl.show() - -def test_doctstrings(): - #_test_dispersion_idx() - import doctest - doctest.testmod() - - -if __name__ == '__main__': - #test_reglogit2() - test_doctstrings() - +from __future__ import division +import warnings +from wafo.wafodata import PlotData +from wafo.misc import findextrema +from scipy import special +import numpy as np +from numpy import inf +from numpy import atleast_1d, nan, ndarray, sqrt, vstack, ones, where, zeros +# , reshape, repeat, product +from numpy import arange, floor, linspace, asarray +from time import gmtime, strftime + + +__all__ = [ + 'edf', 'edfcnd', 'reslife', 'dispersion_idx', 'decluster', 'findpot', + 'declustering_time', 'interexceedance_times', 'extremal_idx'] + +arr = asarray + + +def now(): + ''' + Return current date and time as a string + ''' + return strftime("%a, %d %b %Y %H:%M:%S", gmtime()) + + +def valarray(shape, value=nan, typecode=None): + """Return an array of all value. 
+ """ + #out = reshape(repeat([value], product(shape, axis=0), axis=0), shape) + out = ones(shape, dtype=bool) * value + if typecode is not None: + out = out.astype(typecode) + if not isinstance(out, ndarray): + out = arr(out) + return out + + +def _cdff(self, x, dfn, dfd): + return special.fdtr(dfn, dfd, x) + + +def _cdft(x, df): + return special.stdtr(df, x) + + +def _invt(q, df): + return special.stdtrit(df, q) + + +def _cdfchi2(x, df): + return special.chdtr(df, x) + + +def _invchi2(q, df): + return special.chdtri(df, q) + + +def _cdfnorm(x): + return special.ndtr(x) + + +def _invnorm(q): + return special.ndtri(q) + + +def edf(x, method=2): + ''' + Returns Empirical Distribution Function (EDF). + + Parameters + ---------- + x : array-like + data vector + method : integer scalar + 1. Interpolation so that F(X_(k)) == (k-0.5)/n. + 2. Interpolation so that F(X_(k)) == k/(n+1). (default) + 3. The empirical distribution. F(X_(k)) = k/n + + Example + ------- + >>> import wafo.stats as ws + >>> x = np.linspace(0,6,200) + >>> R = ws.rayleigh.rvs(scale=2,size=100) + >>> F = ws.edf(R) + >>> h = F.plot() + + See also edf, pdfplot, cumtrapz + ''' + z = atleast_1d(x) + z.sort() + + N = len(z) + if method == 1: + Fz1 = arange(0.5, N) / N + elif method == 3: + Fz1 = arange(1, N + 1) / N + else: + Fz1 = arange(1, N + 1) / (N + 1) + + F = PlotData(Fz1, z, xlab='x', ylab='F(x)') + F.setplotter('step') + return F + + +def edfcnd(x, c=None, method=2): + ''' + Returns empirical Distribution Function CoNDitioned that X>=c (EDFCND). + + Parameters + ---------- + x : array-like + data vector + method : integer scalar + 1. Interpolation so that F(X_(k)) == (k-0.5)/n. + 2. Interpolation so that F(X_(k)) == k/(n+1). (default) + 3. The empirical distribution. F(X_(k)) = k/n + + Example + ------- + >>> import wafo.stats as ws + >>> x = np.linspace(0,6,200) + >>> R = ws.rayleigh.rvs(scale=2,size=100) + >>> Fc = ws.edfcnd(R, 1) + >>> hc = Fc.plot() + >>> F = ws.edf(R) + >>> h = F.plot() + + See also edf, pdfplot, cumtrapz + ''' + z = atleast_1d(x) + if c is None: + c = floor(min(z.min(), 0)) + + try: + F = edf(z[c <= z], method=method) + except: + ValueError('No data points above c=%d' % int(c)) + + if - inf < c: + F.labels.ylab = 'F(x| X>=%g)' % c + + return F + + +def reslife(data, u=None, umin=None, umax=None, nu=None, nmin=3, alpha=0.05, + plotflag=False): + ''' + Return Mean Residual Life, i.e., mean excesses vs thresholds + + Parameters + --------- + data : array_like + vector of data of length N. + u : array-like + threshold values (default linspace(umin, umax, nu)) + umin, umax : real scalars + Minimum and maximum threshold, respectively + (default min(data), max(data)). + nu : scalar integer + number of threshold values (default min(N-nmin,100)) + nmin : scalar integer + Minimum number of extremes to include. (Default 3). + alpha : real scalar + Confidence coefficient (default 0.05) + plotflag: bool + + Returns + ------- + mrl : PlotData object + Mean residual life values, i.e., mean excesses over thresholds, u. + + Notes + ----- + RESLIFE estimate mean excesses over thresholds. The purpose of MRL is + to determine the threshold where the upper tail of the data can be + approximated with the generalized Pareto distribution (GPD). The GPD is + appropriate for the tail, if the MRL is a linear function of the + threshold, u. Theoretically in the GPD model + + E(X-u0|X>u0) = s0/(1+k) + E(X-u |X>u) = s/(1+k) = (s0 -k*u)/(1+k) for u>u0 + + where k,s is the shape and scale parameter, respectively. 
+ s0 = scale parameter for threshold u0>> import wafo + >>> R = wafo.stats.genpareto.rvs(0.1,2,2,size=100) + >>> mrl = reslife(R,nu=20) + >>> h = mrl.plot() + + See also + --------- + genpareto + fitgenparrange, disprsnidx + ''' + if u is None: + sd = np.sort(data) + n = len(data) + + nmin = max(nmin, 0) + if 2 * nmin > n: + warnings.warn('nmin possibly too large!') + + sdmax, sdmin = sd[-nmin], sd[0] + umax = sdmax if umax is None else min(umax, sdmax) + umin = sdmin if umin is None else max(umin, sdmin) + + if nu is None: + nu = min(n - nmin, 100) + + u = linspace(umin, umax, nu) + + nu = len(u) + + #mrl1 = valarray(nu) + #srl = valarray(nu) + #num = valarray(nu) + + mean_and_std = lambda data1: (data1.mean(), data1.std(), data1.size) + dat = arr(data) + tmp = arr([mean_and_std(dat[dat > tresh] - tresh) for tresh in u.tolist()]) + + mrl, srl, num = tmp.T + p = 1 - alpha + alpha2 = alpha / 2 + + # Approximate P% confidence interval + #%Za = -invnorm(alpha2); % known mean + Za = -_invt(alpha2, num - 1) # unknown mean + mrlu = mrl + Za * srl / sqrt(num) + mrll = mrl - Za * srl / sqrt(num) + + #options.CI = [mrll,mrlu]; + #options.numdata = num; + titleTxt = 'Mean residual life with %d%s CI' % (100 * p, '%') + res = PlotData(mrl, u, xlab='Threshold', + ylab='Mean Excess', title=titleTxt) + res.workspace = dict( + numdata=num, umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha) + res.children = [ + PlotData(vstack([mrll, mrlu]).T, u, xlab='Threshold', title=titleTxt)] + res.plot_args_children = [':r'] + if plotflag: + res.plot() + return res + + +def dispersion_idx( + data, t=None, u=None, umin=None, umax=None, nu=None, nmin=10, tb=1, + alpha=0.05, plotflag=False): + '''Return Dispersion Index vs threshold + + Parameters + ---------- + data, ti : array_like + data values and sampled times, respectively. + u : array-like + threshold values (default linspace(umin, umax, nu)) + umin, umax : real scalars + Minimum and maximum threshold, respectively + (default min(data), max(data)). + nu : scalar integer + number of threshold values (default min(N-nmin,100)) + nmin : scalar integer + Minimum number of extremes to include. (Default 10). + tb : Real scalar + Block period (same unit as the sampled times) (default 1) + alpha : real scalar + Confidence coefficient (default 0.05) + plotflag: bool + + Returns + ------- + DI : PlotData object + Dispersion index + b_u : real scalar + threshold where the number of exceedances in a fixed period (Tb) is + consistent with a Poisson process. + ok_u : array-like + all thresholds where the number of exceedances in a fixed period (Tb) + is consistent with a Poisson process. + + Notes + ------ + DISPRSNIDX estimate the Dispersion Index (DI) as function of threshold. + DI measures the homogenity of data and the purpose of DI is to determine + the threshold where the number of exceedances in a fixed period (Tb) is + consistent with a Poisson process. For a Poisson process the DI is one. + Thus the threshold should be so high that DI is not significantly + different from 1. + + The Poisson hypothesis is not rejected if the estimated DI is between: + + chi2(alpha/2, M-1)/(M-1)< DI < chi^2(1 - alpha/2, M-1 }/(M - 1) + + where M is the total number of fixed periods/blocks -generally + the total number of years in the sample. + + Example + ------- + >>> import wafo.data + >>> xn = wafo.data.sea() + >>> t, data = xn.T + >>> Ie = findpot(data,t,0,5); + >>> di, u, ok_u = dispersion_idx(data[Ie],t[Ie],tb=100) + >>> h = di.plot() # a threshold around 1 seems appropriate. 
+ >>> round(u*100)/100 + 1.03 + + vline(u) + + See also + -------- + reslife, + fitgenparrange, + extremal_idx + + References + ---------- + Ribatet, M. A.,(2006), + A User's Guide to the POT Package (Version 1.0) + month = {August}, + url = {http://cran.r-project.org/} + + Cunnane, C. (1979) Note on the poisson assumption in + partial duration series model. Water Resource Research, 15\bold{(2)} + :489--494.} + ''' + + n = len(data) + if t is None: + ti = arange(n) + else: + ti = arr(t) - min(t) + + t1 = np.empty(ti.shape, dtype=int) + t1[:] = np.floor(ti / tb) + + if u is None: + sd = np.sort(data) + + nmin = max(nmin, 0) + if 2 * nmin > n: + warnings.warn('nmin possibly too large!') + + sdmax, sdmin = sd[-nmin], sd[0] + umax = sdmax if umax is None else min(umax, sdmax) + umin = sdmin if umin is None else max(umin, sdmin) + + if nu is None: + nu = min(n - nmin, 100) + + u = linspace(umin, umax, nu) + + nu = len(u) + + di = np.zeros(nu) + + d = arr(data) + + mint = int(min(t1)) # ; % mint should be 0. + maxt = int(max(t1)) + M = maxt - mint + 1 + occ = np.zeros(M) + + for ix, tresh in enumerate(u.tolist()): + excess = (d > tresh) + lambda_ = excess.sum() / M + for block in range(M): + occ[block] = sum(excess[t1 == block]) + + di[ix] = occ.var() / lambda_ + + p = 1 - alpha + + diLo = _invchi2(1 - alpha / 2, M - 1) / (M - 1) + diUp = _invchi2(alpha / 2, M - 1) / (M - 1) + + # Find appropriate threshold + k1, = np.where((diLo < di) & (di < diUp)) + if len(k1) > 0: + ok_u = u[k1] + b_di = (di[k1].mean() < di[k1]) + k = b_di.argmax() + b_u = ok_u[k] + else: + b_u = ok_u = None + + CItxt = '%d%s CI' % (100 * p, '%') + titleTxt = 'Dispersion Index plot' + + res = PlotData(di, u, title=titleTxt, + labx='Threshold', laby='Dispersion Index') + #'caption',CItxt); + res.workspace = dict(umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha) + res.children = [ + PlotData(vstack([diLo * ones(nu), diUp * ones(nu)]).T, u, + xlab='Threshold', title=CItxt)] + res.plot_args_children = ['--r'] + if plotflag: + res.plot(di) + return res, b_u, ok_u + + +def decluster(data, t=None, thresh=None, tmin=1): + ''' + Return declustered peaks over threshold values + + Parameters + ---------- + data, t : array-like + data-values and sampling-times, respectively. + thresh : real scalar + minimum threshold for levels in data. + tmin : real scalar + minimum distance to another peak [same unit as t] (default 1) + + Returns + ------- + ev, te : ndarray + extreme values and its corresponding sampling times, respectively, + i.e., all data > thresh which are at least tmin distance apart. + + Example + ------- + >>> import pylab + >>> import wafo.data + >>> from wafo.misc import findtc + >>> x = wafo.data.sea() + >>> t, data = x[:400,:].T + >>> itc, iv = findtc(data,0,'dw') + >>> ytc, ttc = data[itc], t[itc] + >>> ymin = 2*data.std() + >>> tmin = 10 # sec + >>> [ye, te] = decluster(ytc,ttc, ymin,tmin); + >>> h = pylab.plot(t,data,ttc,ytc,'ro',t,zeros(len(t)),':',te,ye,'k.') + + See also + -------- + fitgenpar, findpot, extremalidx + ''' + if t is None: + t = np.arange(len(data)) + i = findpot(data, t, thresh, tmin) + return data[i], t[i] + + +def findpot(data, t=None, thresh=None, tmin=1): + ''' + Retrun indices to Peaks over threshold values + + Parameters + ---------- + data, t : array-like + data-values and sampling-times, respectively. + thresh : real scalar + minimum threshold for levels in data. 
+ tmin : real scalar + minimum distance to another peak [same unit as t] (default 1) + + Returns + ------- + Ie : ndarray + indices to extreme values, i.e., all data > tresh which are at least + tmin distance apart. + + Example + ------- + >>> import pylab + >>> import wafo.data + >>> from wafo.misc import findtc + >>> x = wafo.data.sea() + >>> t, data = x.T + >>> itc, iv = findtc(data,0,'dw') + >>> ytc, ttc = data[itc], t[itc] + >>> ymin = 2*data.std() + >>> tmin = 10 # sec + >>> I = findpot(data, t, ymin, tmin) + >>> yp, tp = data[I], t[I] + >>> Ie = findpot(yp, tp, ymin,tmin) + >>> ye, te = yp[Ie], tp[Ie] + >>> h = pylab.plot(t,data,ttc,ytc,'ro', + ... t,zeros(len(t)),':', + ... te, ye,'k.',tp,yp,'+') + + See also + -------- + fitgenpar, decluster, extremalidx + ''' + Data = arr(data) + if t is None: + ti = np.arange(len(Data)) + else: + ti = arr(t) + + Ie, = where(Data > thresh) + Ye = Data[Ie] + Te = ti[Ie] + if len(Ye) <= 1: + return Ie + + dT = np.diff(Te) + notSorted = np.any(dT < 0) + if notSorted: + I = np.argsort(Te) + Te = Te[I] + Ie = Ie[I] + Ye = Ye[I] + dT = np.diff(Te) + + isTooSmall = (dT <= tmin) + + if np.any(isTooSmall): + isTooClose = np.hstack( + (isTooSmall[0], isTooSmall[:-1] | isTooSmall[1:], isTooSmall[-1])) + + # Find opening (NO) and closing (NC) index for data beeing to close: + iy = findextrema(np.hstack([0, 0, isTooSmall, 0])) + + NO = iy[::2] - 1 + NC = iy[1::2] + + for no, nc in zip(NO, NC): + iz = slice(no, nc) + iOK = _find_ok_peaks(Ye[iz], Te[iz], tmin) + if len(iOK): + isTooClose[no + iOK] = 0 + # Remove data which is too close to other data. + if isTooClose.any(): + # len(tooClose)>0: + iOK, = where(1 - isTooClose) + Ie = Ie[iOK] + + return Ie + + +def _find_ok_peaks(Ye, Te, Tmin): + ''' + Return indices to the largest maxima that are at least Tmin + distance apart. + ''' + Ny = len(Ye) + + I = np.argsort(-Ye) # sort in descending order + + Te1 = Te[I] + oOrder = zeros(Ny, dtype=int) + oOrder[I] = range(Ny) # indices to the variables original location + + isTooClose = zeros(Ny, dtype=bool) + + pool = zeros((Ny, 2)) + T_range = np.hstack([-Tmin, Tmin]) + K = 0 + for i, ti in enumerate(Te1): + isTooClose[i] = np.any((pool[:K, 0] <= ti) & (ti <= pool[:K, 1])) + if not isTooClose[i]: + pool[K] = ti + T_range + K += 1 + + iOK, = where(1 - isTooClose[oOrder]) + return iOK + + +def declustering_time(t): + ''' + Returns minimum distance between clusters. + + Parameters + ---------- + t : array-like + sampling times for data. + + Returns + ------- + tc : real scalar + minimum distance between clusters. + + Example + ------- + >>> import wafo.data + >>> x = wafo.data.sea() + >>> t, data = x[:400,:].T + >>> Ie = findpot(data,t,0,5) + >>> tc = declustering_time(Ie) + >>> tc + 21 + ''' + t0 = arr(t) + nt = len(t0) + if nt < 2: + return arr([]) + ti = interexceedance_times(t0) + ei = extremal_idx(ti) + if ei == 1: + tc = ti.min() + else: + i = int(np.floor(nt * ei)) + sti = -np.sort(-ti) + tc = sti[min(i, nt - 2)] # % declustering time + return tc + + +def interexceedance_times(t): + ''' + Returns interexceedance times of data + + Parameters + ---------- + t : array-like + sampling times for data. + Returns + ------- + ti : ndarray + interexceedance times + + Example + ------- + >>> t = [1,2,5,10] + >>> interexceedance_times(t) + array([1, 3, 5]) + + ''' + return np.diff(np.sort(t)) + + +def extremal_idx(ti): + ''' + Returns Extremal Index measuring the dependence of data + + Parameters + ---------- + ti : array-like + interexceedance times for data. 
+ + Returns + ------- + ei : real scalar + Extremal index. + + Notes + ----- + The Extremal Index (EI) is one if the data are independent and less than + one if there are some dependence. The extremal index can also be intepreted + as the reciprocal of the mean cluster size. + + Example + ------- + >>> import wafo.data + >>> x = wafo.data.sea() + >>> t, data = x[:400,:].T + >>> Ie = findpot(data,t,0,5); + >>> ti = interexceedance_times(Ie) + >>> ei = extremal_idx(ti) + >>> ei + 1 + + See also + -------- + reslife, fitgenparrange, disprsnidx, findpot, decluster + + + Reference + --------- + Christopher A. T. Ferro, Johan Segers (2003) + Inference for clusters of extreme values + Journal of the Royal Statistical society: Series B + (Statistical Methodology) 54 (2), 545-556 + doi:10.1111/1467-9868.00401 + ''' + t = arr(ti) + tmax = t.max() + if tmax <= 1: + ei = 0 + elif tmax <= 2: + ei = min(1, 2 * t.mean() ** 2 / ((t ** 2).mean())) + else: + ei = min(1, 2 * np.mean(t - 1) ** 2 / np.mean((t - 1) * (t - 2))) + return ei + + +def _logit(p): + return np.log(p) - np.log1p(-p) + + +def _logitinv(x): + return 1.0 / (np.exp(-x) + 1) + + +class RegLogit(object): + + ''' + REGLOGIT Fit ordinal logistic regression model. + + CALL model = reglogit (options) + + model = fitted model object with methods + .compare() : Compare small LOGIT object versus large one + .predict() : Predict from a fitted LOGIT object + .summary() : Display summary of fitted LOGIT object. + + y = vector of K ordered categories + x = column vectors of covariates + options = struct defining performance of REGLOGIT + .maxiter : maximum number of iterations. + .accuracy : accuracy in convergence. + .betastart : Start value for BETA (default 0) + .thetastart : Start value for THETA (default depends on Y) + .alpha : Confidence coefficent (default 0.05) + .verbose : 1 display summary info about fitted model + 2 display convergence info in each iteration + otherwise no action + .deletecolinear : If true delete colinear covarites (default) + + Methods + .predict : Predict from a fitted LOGIT object + .summary : Display summary of fitted LOGIT object. + .compare : Compare small LOGIT versus large one + + Suppose Y takes values in K ordered categories, and let + gamma_i (x) be the cumulative probability that Y + falls in one of the first i categories given the covariate + X. The ordinal logistic regression model is + + logit (mu_i (x)) = theta_i + beta' * x, i = 1...k-1 + + The number of ordinal categories, K, is taken to be the number + of distinct values of round (Y). If K equals 2, + Y is binary and the model is ordinary logistic regression. The + matrix X is assumed to have full column rank. + + Given Y only, theta = REGLOGIT(Y) fits the model with baseline logit odds + only. + + Example + y=[1 1 2 1 3 2 3 2 3 3]' + x = (1:10)' + b = reglogit(y,x) + b.display() % members and methods + b.get() % return members + b.summary() + [mu,plo,pup] = b.predict(); + plot(x,mu,'g',x,plo,'r:',x,pup,'r:') + + y2 = [zeros(5,1);ones(5,1)]; + x1 = [29,30,31,31,32,29,30,31,32,33]; + x2 = [62,83,74,88,68,41,44,21,50,33]; + X = [x1;x2].'; + b2 = reglogit(y2,X); + b2.summary(); + b21 = reglogit(y2,X(:,1)); + b21.compare(b2) + + See also regglm, reglm, regnonlm + ''' + + #% Original for MATLAB written by Gordon K Smyth , + #% U of Queensland, Australia, on Nov 19, 1990. Last revision Aug 3, + #% 1992. 
+ # + #% Author: Gordon K Smyth , + #% Revised by: pab + #% -renamed from oridinal to reglogit + #% -added predict, summary and compare + #% Description: Ordinal logistic regression + # + #% Uses the auxiliary functions logistic_regression_derivatives and + #% logistic_regression_likelihood. + + def __init__(self, maxiter=500, accuracy=1e-6, alpha=0.05, + deletecolinear=True, verbose=False): + + self.maxiter = maxiter + self.accuracy = accuracy + self.alpha = alpha + self.deletecolinear = deletecolinear + self.verbose = False + self.family = None + self.link = None + self.numvar = None + self.numobs = None + self.numk = None + self.df = None + self.df_null = None + self.params = None + self.params_ci = None + self.params_cov = None + self.params_std = None + self.params_corr = None + self.params_tstat = None + self.params_pvalue = None + self.mu = None + self.eta = None + self.X = None + self.Y = None + self.theta = None + self.beta = None + self.residual = None + self.residual1d = None + self.deviance = None + self.deviance_null = None + self.d2L = None + self.dL = None + self.dispersionfit = None + self.dispersion = 1 + self.R2 = None + self.R2adj = None + self.numiter = None + self.converged = None + self.note = '' + self.date = now() + + def check_xy(self, y, X): + y = np.round(np.atleast_2d(y)) + my = y.shape[0] + if X is None: + X = np.zeros((my, 0)) + elif self.deletecolinear: + X = np.atleast_2d(X) + # Make sure X is full rank + s = np.linalg.svd(X)[1] + tol = max(X.shape) * np.finfo(s.max()).eps + ix = np.flatnonzero(s > tol) + iy = np.flatnonzero(s <= tol) + if len(ix): + X = X[:, ix] + txt = [' %d,' % i for i in iy] + #txt[-1] = ' %d' % iy[-1] + warnings.warn( + 'Covariate matrix is singular. Removing column(s):%s' % + txt) + mx = X.shape[0] + if (mx != my): + raise ValueError( + 'x and y must have the same number of observations') + return y, X + + def fit(self, y, X=None, theta0=None, beta0=None): + ''' + Member variables + .df : degrees of freedom for error. + .params : estimated model parameters + .params_ci : 100(1-alpha)% confidence interval for model parameters + .params_tstat : t statistics for model's estimated parameters. + .params_pvalue: p value for model's estimated parameters. + .params_std : standard errors for estimated parameters + .params_corr : correlation matrix for estimated parameters. + .mu : fitted values for the model. + .eta : linear predictor for the model. + .residual : residual for the model (Y-E(Y|X)). + .dispersnfit : The estimated error variance + .deviance : deviance for the model equal minus twice the + log-likelihood. + .d2L : Hessian matrix (double derivative of log-likelihood) + .dL : First derivative of loglikelihood w.r.t. THETA and BETA. 
+ + ''' + self.family = 'multinomial' + self.link = 'logit' + y, X = self.check_xy(y, X) + + # initial calculations + tol = self.accuracy + incr = 10 + decr = 2 + ymin = y.min() + ymax = y.max() + yrange = ymax - ymin + z = (y * ones((1, yrange))) == ((y * 0 + 1) * np.arange(ymin, ymax)) + z1 = (y * ones((1, yrange))) == ( + (y * 0 + 1) * np.arange(ymin + 1, ymax + 1)) + z = z[:, np.flatnonzero(z.any(axis=0))] + z1 = z1[:, np.flatnonzero(z1.any(axis=0))] + [_mz, nz] = z.shape + [_mx, nx] = X.shape + [my, _ny] = y.shape + + g = (z.sum(axis=0).cumsum() / my).reshape(-1, 1) + theta00 = np.log(g / (1 - g)).ravel() + beta00 = np.zeros((nx,)) + # starting values + if theta0 is None: + theta0 = theta00 + + if beta0 is None: + beta0 = beta00 + + tb = np.hstack((theta0, beta0)) + + # likelihood and derivatives at starting values + [dev, dl, d2l] = self.loglike(tb, y, X, z, z1) + + epsilon = np.std(d2l) / 1000 + if np.any(beta0) or np.any(theta00 != theta0): + tb0 = np.vstack((theta00, beta00)) + nulldev = self.loglike(tb0, y, X, z, z1)[0] + else: + nulldev = dev + + # maximize likelihood using Levenberg modified Newton's method + for i in range(self.maxiter + 1): + + tbold = tb + devold = dev + tb = tbold - np.linalg.lstsq(d2l, dl)[0] + [dev, dl, d2l] = self.loglike(tb, y, X, z, z1) + if ((dev - devold) / np.dot(dl, tb - tbold) < 0): + epsilon = epsilon / decr + else: + while ((dev - devold) / np.dot(dl, tb - tbold) > 0): + epsilon = epsilon * incr + if (epsilon > 1e+15): + raise ValueError('epsilon too large') + + tb = tbold - \ + np.linalg.lstsq(d2l - epsilon * np.eye(d2l.shape), dl) + [dev, dl, d2l] = self.loglike(tb, y, X, z, z1) + print('epsilon %g' % epsilon) + # end %while + # end else + #[dl, d2l] = logistic_regression_derivatives (X, z, z1, g, g1, p); + if (self.verbose > 1): + + print('Iter: %d, Deviance: %8.6f', iter, dev) + print('First derivative') + print(dl) + print('Eigenvalues of second derivative') + print(np.linalg.eig(d2l)[0].T) + # end + # end + stop = np.abs( + np.dot(dl, np.linalg.lstsq(d2l, dl)[0]) / len(dl)) <= tol + if stop: + break + # end %while + + #% tidy up output + + theta = tb[:nz, ] + beta = tb[nz:(nz + nx)] + pcov = np.linalg.pinv(-d2l) + se = sqrt(np.diag(pcov)) + + if (nx > 0): + eta = ((X * beta) * ones((1, nz))) + ((y * 0 + 1) * theta) + else: + eta = (y * 0 + 1) * theta + # end + gammai = np.diff( + np.hstack(((y * 0), _logitinv(eta), (y * 0 + 1))), n=1, axis=1) + k0 = min(y) + mu = (k0 - 1) + np.dot(gammai, np.arange(1, nz + 2)).reshape(-1, 1) + r = np.corrcoef(np.hstack((y, mu)).T) + R2 = r[0, 1] ** 2 + # coefficient of determination + # adjusted coefficient of determination + R2adj = max(1 - (1 - R2) * (my - 1) / (my - nx - nz - 1), 0) + + res = y - mu + + if nz == 1: + self.family = 'binomial' + else: + self.family = 'multinomial' + + self.link = 'logit' + + self.numvar = nx + nz + self.numobs = my + self.numk = nz + 1 + self.df = max(my - nx - nz, 0) + self.df_null = my - nz + # nulldf; nulldf = n - nz + self.params = tb[:(nz + nx)] + self.params_ci = 1 + self.params_std = se + self.params_cov = pcov + self.params_tstat = (self.params / self.params_std) + # % options.estdispersn %dispersion_parameter=='mean_deviance' + if False: + self.params_pvalue = 2. * _cdft(-abs(self.params_tstat), self.df) + bcrit = -se * _invt(self.alpha / 2, self.df) + else: + self.params_pvalue = 2. 
* _cdfnorm(-abs(self.params_tstat)) + bcrit = -se * _invnorm(self.alpha / 2) + # end + self.params_ci = np.vstack((self.params + bcrit, self.params - bcrit)) + + self.mu = gammai + self.eta = _logit(gammai) + self.X = X + [dev, dl, d2l, p] = self.loglike(tb, y, X, z, z1, numout=4) + self.theta = theta + self.beta = beta + self.gamma = gammai + self.residual = res.T + self.residualD = np.sign(self.residual) * sqrt(-2 * np.log(p)) + self.deviance = dev + self.deviance_null = nulldev + self.d2L = d2l + self.dL = dl.T + self.dispersionfit = 1 + self.dispersion = 1 + self.R2 = R2 + self.R2adj = R2adj + self.numiter = i + self.converged = i < self.maxiter + self.note = '' + self.date = now() + + if (self.verbose): + self.summary() + + def compare(self, object2): + ''' Compare small LOGIT versus large one + + CALL [pvalue] = compare(object2) + + The standard hypothesis test of a larger linear regression + model against a smaller one. The standard Chi2-test is used. + The output is the p-value, the residuals from the smaller + model, and the residuals from the larger model. + + See also fitls + ''' + + try: + if self.numvar > object2.numvar: + devL = self.deviance + nL = self.numvar + dfL = self.df + Al = self.X + disprsn = self.dispersionfit + devs = object2.deviance + ns = object2.numvar + dfs = object2.df + As = object2.X + else: + devL = object2.deviance + nL = object2.numvar + dfL = object2.df + Al = object2.X + disprsn = object2.dispersionfit + devs = self.deviance + ns = self.numvar + dfs = self.df + As = self.X + # end + + if (((As - np.dot(Al * np.linalg.lstsq(Al, As))) > 500 * np.finfo(float).eps).any() or + object2.family != self.family or object2.link != self.link): + warnings.warn('Small model not included in large model,' + + ' result is rubbish!') + + except: + raise ValueError('Apparently not a valid regression object') + + pmq = np.abs(nL - ns) + print(' ') + print(' Analysis of Deviance') + if False: # options.estdispersn + localstat = abs(devL - devs) / disprsn / pmq +# localpvalue = 1-cdff(localstat, pmq, dfL) +# print('Model DF Residual deviance F-stat Pr(>F)') + else: + localstat = abs(devL - devs) / disprsn + localpvalue = 1 - _cdfchi2(localstat, pmq) + print('Model DF Residual deviance Chi2-stat ' + + ' Pr(>Chi2)') + # end + + print('Small %d %12.4f %12.4f %12.4f' % + (dfs, devs, localstat, localpvalue)) + print('Full %d %12.4f' % (dfL, devL)) + print(' ') + + return localpvalue + + def anode(self): + print(' ') + print(' Analysis of Deviance') + if False: # %options.estdispersn + localstat = abs(self.deviance_null - self.deviance) / \ + self.dispersionfit / (self.numvar - 1) + localpvalue = 1 - _cdff(localstat, self.numvar - 1, self.df) + print( + 'Model DF Residual deviance F-stat Pr(>F)') + else: + localstat = abs( + self.deviance_null - self.deviance) / self.dispersionfit + localpvalue = 1 - _cdfchi2(localstat, self.numvar - 1) + print('Model DF Residual deviance Chi2-stat' + + ' Pr(>Chi2)') + # end + + print('Null %d %12.4f %12.4f %12.4f' % + (self.df_null, self.deviance_null, localstat, localpvalue)) + print('Full %d %12.4f' % (self.df, self.deviance)) + print(' ') + + print(' R2 = %2.4f, R2adj = %2.4f' % (self.R2, self.R2adj)) + print(' ') + return localpvalue + + def summary(self): + txtlink = self.link + + print('Call:') + print('reglogit(formula = %s(Pr(grp(y)<=i)) ~ theta_i+beta*x, family = %s)' % + (txtlink, self.family)) + print(' ') + print('Deviance Residuals:') + m, q1, me, q3, M = np.percentile( + self.residualD, q=[0, 25, 50, 75, 100]) + print(' Min 1Q 
Median 3Q Max ') + print('%2.4f %2.4f %2.4f %2.4f %2.4f' % + (m, q1, me, q3, M)) + print(' ') + print(' Coefficients:') + if False: # %options.estdispersn + print( + ' Estimate Std. Error t value Pr(>|t|)') + else: + print( + ' Estimate Std. Error z value Pr(>|z|)') + # end + e, s, z, p = (self.params, self.params_std, self.params_tstat, + self.params_pvalue) + for i in range(self.numk): + print( + 'theta_%d %2.4f %2.4f %2.4f %2.4f' % + (i, e[i], s[i], z[i], p[i])) + + for i in range(self.numk, self.numvar): + print( + ' beta_%d %2.4f %2.4f %2.4f %2.4f\n' % + (i - self.numk, e[i], s[i], z[i], p[i])) + + print(' ') + print('(Dispersion parameter for %s family taken to be %2.2f)' % + (self.family, self.dispersionfit)) + print(' ') + if True: # %options.constant + print(' Null deviance: %2.4f on %d degrees of freedom' % + (self.deviance_null, self.df_null)) + # end + print('Residual deviance: %2.4f on %d degrees of freedom' % + (self.deviance, self.df)) + + self.anode() + + #end % summary + + def predict(self, Xnew=None, alpha=0.05, fulloutput=False): + '''LOGIT/PREDICT Predict from a fitted LOGIT object + + CALL [y,ylo,yup] = predict(Xnew,options) + + y = predicted value + ylo,yup = 100(1-alpha)% confidence interval for y + + Xnew = new covariate + options = options struct defining the calculation + .alpha : confidence coefficient (default 0.05) + .size : size if binomial family (default 1). + ''' + + [_mx, nx] = self.X.shape + if Xnew is None: + Xnew = self.X + else: + Xnew = np.atleast_2d(Xnew) + notnans = np.flatnonzero(1 - (1 - np.isfinite(Xnew)).any(axis=1)) + Xnew = Xnew[notnans, :] + + [n, p] = Xnew.shape + + if p != nx: + raise ValueError('Number of covariates must match the number' + + ' of regression coefficients') + + nz = self.numk - 1 + one = ones((n, 1)) + if (nx > 0): + eta = np.dot(Xnew, self.beta).reshape(-1, 1) + self.theta + else: + eta = one * self.theta + # end + y = np.diff( + np.hstack((zeros((n, 1)), _logitinv(eta), one)), n=1, axis=1) + if fulloutput: + eps = np.finfo(float).eps + pcov = self.params_cov + if (nx > 0): + np1 = pcov.shape[0] + + [U, S, V] = np.linalg.svd(pcov, 0) + # %squareroot of pcov + R = np.dot(U, np.dot(np.diag(sqrt(S)), V)) + ib = np.r_[0, nz:np1] + + #% Var(eta_i) = var(theta_i+Xnew*b) + vareta = zeros((n, nz)) + u = np.hstack((one, Xnew)) + for i in range(nz): + ib[0] = i + vareta[:, i] = np.maximum( + ((np.dot(u, R[ib][:, ib])) ** 2).sum(axis=1), eps) + # end + else: + vareta = np.diag(pcov) + # end + crit = -_invnorm(alpha / 2) + + ecrit = crit * sqrt(vareta) + mulo = _logitinv(eta - ecrit) + muup = _logitinv(eta + ecrit) + ylo1 = np.diff(np.hstack((zeros((n, 1)), mulo, one)), n=1, axis=1) + yup1 = np.diff(np.hstack((zeros((n, 1)), muup, one)), n=1, axis=1) + + ylo = np.minimum(ylo1, yup1) + yup = np.maximum(ylo1, yup1) + + for i in range(1, nz): # = 2:self.numk-1 + yup[:, i] = np.vstack( + (yup[:, i], muup[:, i] - mulo[:, i - 1])).max(axis=0) + # end + return y, ylo, yup + return y + + def loglike(self, beta, y, x, z, z1, numout=3): + ''' + [dev, dl, d2l, p] = loglike( y ,x,beta,z,z1) + Calculates likelihood for the ordinal logistic regression model. + ''' + # Author: Gordon K. 
Smyth + zx = np.hstack((z, x)) + z1x = np.hstack((z1, x)) + g = _logitinv(np.dot(zx, beta)).reshape((-1, 1)) + g1 = _logitinv(np.dot(z1x, beta)).reshape((-1, 1)) + g = np.maximum(y == y.max(), g) + g1 = np.minimum(y > y.min(), g1) + + p = g - g1 + dev = -2 * np.log(p).sum() + + '''[dl, d2l] = derivatives of loglike(beta, y, x, z, z1) + % Called by logistic_regression. Calculates derivates of the + % log-likelihood for ordinal logistic regression model. + ''' + # Author: Gordon K. Smyth + # Description: Derivates of log-likelihood in logistic regression + + # first derivative + v = g * (1 - g) / p + v1 = g1 * (1 - g1) / p + dlogp = np.hstack((((v * z) - (v1 * z1)), ((v - v1) * x))) + dl = np.sum(dlogp, axis=0) + + # second derivative + w = v * (1 - 2 * g) + w1 = v1 * (1 - 2 * g1) + d2l = np.dot(zx.T, (w * zx)) - np.dot( + z1x.T, (w1 * z1x)) - np.dot(dlogp.T, dlogp) + + if numout == 4: + return dev, dl, d2l, p + else: + return dev, dl, d2l + #end %function + + +def _test_dispersion_idx(): + import wafo.data + xn = wafo.data.sea() + t, data = xn.T + Ie = findpot(data, t, 0, 5) + di, _u, _ok_u = dispersion_idx(data[Ie], t[Ie], tb=100) + di.plot() # a threshold around 1 seems appropriate. + di.show() + pass + + +def _test_findpot(): + import pylab + import wafo.data + from wafo.misc import findtc + x = wafo.data.sea() + t, data = x[:, :].T + itc, _iv = findtc(data, 0, 'dw') + ytc, ttc = data[itc], t[itc] + ymin = 2 * data.std() + tmin = 10 # sec + I = findpot(data, t, ymin, tmin) + yp, tp = data[I], t[I] + Ie = findpot(yp, tp, ymin, tmin) + ye, te = yp[Ie], tp[Ie] + pylab.plot(t, data, ttc, ytc, 'ro', t, + zeros(len(t)), ':', te, ye, 'kx', tp, yp, '+') + pylab.show() + pass + + +def _test_reslife(): + import wafo + R = wafo.stats.genpareto.rvs(0.1, 2, 2, size=100) + mrl = reslife(R, nu=20) + mrl.plot() + + +def test_reglogit(): + y = np.array([1, 1, 2, 1, 3, 2, 3, 2, 3, 3]).reshape(-1, 1) + x = np.arange(1, 11).reshape(-1, 1) + b = RegLogit() + b.fit(y, x) + # b.display() #% members and methods + + b.summary() + [mu, plo, pup] = b.predict(fulloutput=True) # @UnusedVariable + pass + # plot(x,mu,'g',x,plo,'r:',x,pup,'r:') + + +def test_reglogit2(): + n = 40 + x = np.sort(5 * np.random.rand(n, 1) - 2.5, axis=0) + y = (np.cos(x) > 2 * np.random.rand(n, 1) - 1) + b = RegLogit() + b.fit(y, x) + # b.display() #% members and methods + b.summary() + [mu, plo, pup] = b.predict(fulloutput=True) + import matplotlib.pyplot as pl + pl.plot(x, mu, 'g', x, plo, 'r:', x, pup, 'r:') + pl.show() + + +def test_sklearn0(): + from sklearn.linear_model import LogisticRegression + from sklearn import datasets # @UnusedImport + + # FIXME: the iris dataset has only 4 features! 
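    # Background for the FIXME above: sklearn's load_iris() returns a (150, 4)
    # feature matrix with three target classes, which is awkward to visualise
    # against a single covariate, so the synthetic 1-D cosine data generated
    # below is used instead.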
+# iris = datasets.load_iris() +# X = iris.data +# y = iris.target + + X = np.sort(5 * np.random.rand(40, 1) - 2.5, axis=0) + y = (2 * (np.cos(X) > 2 * np.random.rand(40, 1) - 1) - 1).ravel() + + score = [] + # Set regularization parameter + cvals = np.logspace(-1, 1, 5) + for C in cvals: + clf_LR = LogisticRegression(C=C, penalty='l2') + clf_LR.fit(X, y) + score.append(clf_LR.score(X, y)) + + #plot(cvals, score) + + +def test_sklearn(): + X = np.sort(5 * np.random.rand(40, 1) - 2.5, axis=0) + y = (2 * (np.cos(X) > 2 * np.random.rand(40, 1) - 1) - 1).ravel() + from sklearn.svm import SVR + + # + # look at the results + import pylab as pl + pl.scatter(X, .5 * np.cos(X) + 0.5, c='k', label='True model') + pl.hold('on') + cvals = np.logspace(-1, 3, 20) + score = [] + for c in cvals: + svr_rbf = SVR(kernel='rbf', C=c, gamma=0.1, probability=True) + svrf = svr_rbf.fit(X, y) + y_rbf = svrf.predict(X) + score.append(svrf.score(X, y)) + pl.plot(X, y_rbf, label='RBF model c=%g' % c) + pl.xlabel('data') + pl.ylabel('target') + pl.title('Support Vector Regression') + pl.legend() + pl.show() + + +def test_sklearn1(): + X = np.sort(5 * np.random.rand(40, 1) - 2.5, axis=0) + y = (2 * (np.cos(X) > 2 * np.random.rand(40, 1) - 1) - 1).ravel() + from sklearn.svm import SVR + +# cvals= np.logspace(-1,4,10) + svr_rbf = SVR(kernel='rbf', C=1e4, gamma=0.1, probability=True) + svr_lin = SVR(kernel='linear', C=1e4, probability=True) + svr_poly = SVR(kernel='poly', C=1e4, degree=2, probability=True) + y_rbf = svr_rbf.fit(X, y).predict(X) + y_lin = svr_lin.fit(X, y).predict(X) + y_poly = svr_poly.fit(X, y).predict(X) + + # + # look at the results + import pylab as pl + pl.scatter(X, .5 * np.cos(X) + 0.5, c='k', label='True model') + pl.hold('on') + pl.plot(X, y_rbf, c='g', label='RBF model') + pl.plot(X, y_lin, c='r', label='Linear model') + pl.plot(X, y_poly, c='b', label='Polynomial model') + pl.xlabel('data') + pl.ylabel('target') + pl.title('Support Vector Regression') + pl.legend() + pl.show() + + +def test_doctstrings(): + #_test_dispersion_idx() + import doctest + doctest.testmod() + + +if __name__ == '__main__': + # test_reglogit2() + test_doctstrings() diff --git a/pywafo/src/wafo/stats/distributions.py b/pywafo/src/wafo/stats/distributions.py index 03e278f..7e76ca9 100644 --- a/pywafo/src/wafo/stats/distributions.py +++ b/pywafo/src/wafo/stats/distributions.py @@ -1,9070 +1,13 @@ -# -# Author: Travis Oliphant 2002-2011 with contributions from -# SciPy Developers 2004-2011 -# -from __future__ import division, print_function, absolute_import - -import math -import sys -import warnings - -#from scipy.lib. 
-from wafo.stats.six import callable, string_types, get_method_function -from wafo.stats.six import exec_ - -from scipy.misc import comb, derivative -from scipy.misc.doccer import inherit_docstring_from -from scipy import special -from scipy import optimize -from scipy import integrate -from scipy.special import gammaln as gamln - -import keyword -import re -import inspect -from numpy import all, where, arange, putmask, \ - ravel, take, ones, sum, shape, product, reshape, \ - zeros, floor, logical_and, log, sqrt, exp, arctanh, tan, sin, arcsin, \ - arctan, tanh, ndarray, cos, cosh, sinh, newaxis, log1p, expm1 -from numpy import atleast_1d, polyval, ceil, place, extract, \ - any, argsort, argmax, vectorize, r_, asarray, nan, inf, pi, isinf, \ - NINF, empty -import numpy -import numpy as np -import numpy.random as mtrand -from numpy import flatnonzero as nonzero - -_log1p = log1p - -from wafo.stats.estimation import FitDistribution - -try: - from scipy.stats.distributions import vonmises_cython -except: - vonmises_cython = None -try: - from scipy.stats._tukeylambda_stats import tukeylambda_variance as _tlvar, \ - tukeylambda_kurtosis as _tlkurt -except: - _tlvar = _tlkurt = None - -__all__ = [ - 'rv_continuous', - 'ksone', 'kstwobign', 'norm', 'alpha', 'anglit', 'arcsine', - 'beta', 'betaprime', 'bradford', 'burr', 'fisk', 'cauchy', - 'chi', 'chi2', 'cosine', 'dgamma', 'dweibull', 'erlang', - 'expon', 'exponweib', 'exponpow', 'fatiguelife', 'foldcauchy', - 'f', 'foldnorm', 'frechet_r', 'weibull_min', 'frechet_l', - 'weibull_max', 'genlogistic', 'genpareto', 'genexpon', 'genextreme', - 'gamma', 'gengamma', 'genhalflogistic', 'gompertz', 'gumbel_r', - 'gumbel_l', 'halfcauchy', 'halflogistic', 'halfnorm', 'hypsecant', - 'gausshyper', 'invgamma', 'invgauss', 'invweibull', - 'johnsonsb', 'johnsonsu', 'laplace', 'levy', 'levy_l', - 'levy_stable', 'logistic', 'loggamma', 'loglaplace', 'lognorm', - 'gilbrat', 'maxwell', 'mielke', 'nakagami', 'ncx2', 'ncf', 't', - 'nct', 'pareto', 'lomax', 'pearson3', 'powerlaw', 'powerlognorm', - 'powernorm', 'rdist', 'rayleigh', 'reciprocal', 'rice', - 'truncrayleigh', - 'recipinvgauss', 'semicircular', 'triang', 'truncexpon', - 'truncnorm', 'tukeylambda', 'uniform', 'vonmises', 'wald', - 'wrapcauchy', 'entropy', 'rv_discrete', 'binom', 'bernoulli', - 'nbinom', 'geom', 'hypergeom', 'logser', 'poisson', 'planck', - 'boltzmann', 'randint', 'zipf', 'dlaplace', 'skellam' - ] - -floatinfo = numpy.finfo(float) -eps = numpy.finfo(float).eps - -gam = special.gamma -random = mtrand.random_sample - -import types -from scipy.misc import doccer - -try: - from new import instancemethod -except ImportError: - # Python 3 - def instancemethod(func, obj, cls): - return types.MethodType(func, obj) - -def log1p(x): - '''avoids warnings for x==-1''' - mx = where(x==-1, 0, x) - return where(x==-1, -inf, _log1p(mx)) - -def xlogy(x,y): - cond = (x == 0.0) & (y == 0) - logy = where(cond, 1.0, log(y)) - return where(cond, 0.0, x * logy) - -def xlog1py(x,y): - cond = (x == 0.0) & (y == -1) - log1py = where(cond, 1.0, log1p(y)) - return where(cond, 0.0, x * log1py) - -special.xlogy = xlogy -special.xlog1py = xlog1py - -# These are the docstring parts used for substitution in specific -# distribution docstrings - -docheaders = {'methods':"""\nMethods\n-------\n""", - 'parameters':"""\nParameters\n---------\n""", - 'notes':"""\nNotes\n-----\n""", - 'examples':"""\nExamples\n--------\n"""} - -_doc_rvs = \ -"""rvs(%(shapes)s, loc=0, scale=1, size=1) - Random variates. 
-""" -_doc_pdf = \ -"""pdf(x, %(shapes)s, loc=0, scale=1) - Probability density function. -""" -_doc_logpdf = \ -"""logpdf(x, %(shapes)s, loc=0, scale=1) - Log of the probability density function. -""" -_doc_pmf = \ -"""pmf(x, %(shapes)s, loc=0, scale=1) - Probability mass function. -""" -_doc_logpmf = \ -"""logpmf(x, %(shapes)s, loc=0, scale=1) - Log of the probability mass function. -""" -_doc_cdf = \ -"""cdf(x, %(shapes)s, loc=0, scale=1) - Cumulative density function. -""" -_doc_logcdf = \ -"""logcdf(x, %(shapes)s, loc=0, scale=1) - Log of the cumulative density function. -""" -_doc_sf = \ -"""sf(x, %(shapes)s, loc=0, scale=1) - Survival function (1-cdf --- sometimes more accurate). -""" -_doc_logsf = \ -"""logsf(x, %(shapes)s, loc=0, scale=1) - Log of the survival function. -""" -_doc_ppf = \ -"""ppf(q, %(shapes)s, loc=0, scale=1) - Percent point function (inverse of cdf --- percentiles). -""" -_doc_isf = \ -"""isf(q, %(shapes)s, loc=0, scale=1) - Inverse survival function (inverse of sf). -""" -_doc_moment = \ -"""moment(n, %(shapes)s, loc=0, scale=1) - Non-central moment of order n -""" -_doc_stats = \ -"""stats(%(shapes)s, loc=0, scale=1, moments='mv') - Mean('m'), variance('v'), skew('s'), and/or kurtosis('k'). -""" -_doc_entropy = \ -"""entropy(%(shapes)s, loc=0, scale=1) - (Differential) entropy of the RV. -""" -_doc_fit = \ -"""fit(data, %(shapes)s, loc=0, scale=1) - Parameter estimates for generic data. -""" -_doc_expect = \ -"""expect(func, %(shapes)s, loc=0, scale=1, lb=None, ub=None, conditional=False, **kwds) - Expected value of a function (of one argument) with respect to the distribution. -""" -_doc_expect_discrete = \ -"""expect(func, %(shapes)s, loc=0, lb=None, ub=None, conditional=False) - Expected value of a function (of one argument) with respect to the distribution. -""" -_doc_median = \ -"""median(%(shapes)s, loc=0, scale=1) - Median of the distribution. -""" -_doc_mean = \ -"""mean(%(shapes)s, loc=0, scale=1) - Mean of the distribution. -""" -_doc_var = \ -"""var(%(shapes)s, loc=0, scale=1) - Variance of the distribution. -""" -_doc_std = \ -"""std(%(shapes)s, loc=0, scale=1) - Standard deviation of the distribution. -""" -_doc_interval = \ -"""interval(alpha, %(shapes)s, loc=0, scale=1) - Endpoints of the range that contains alpha percent of the distribution -""" -_doc_allmethods = ''.join([docheaders['methods'], _doc_rvs, _doc_pdf, - _doc_logpdf, _doc_cdf, _doc_logcdf, _doc_sf, - _doc_logsf, _doc_ppf, _doc_isf, _doc_moment, - _doc_stats, _doc_entropy, _doc_fit, - _doc_expect, _doc_median, - _doc_mean, _doc_var, _doc_std, _doc_interval]) - -# Note that the two lines for %(shapes) are searched for and replaced in -# rv_continuous and rv_discrete - update there if the exact string changes -_doc_default_callparams = \ -""" -Parameters ----------- -x : array_like - quantiles -q : array_like - lower or upper tail probability -%(shapes)s : array_like - shape parameters -loc : array_like, optional - location parameter (default=0) -scale : array_like, optional - scale parameter (default=1) -size : int or tuple of ints, optional - shape of random variates (default computed from input arguments ) -moments : str, optional - composed of letters ['mvsk'] specifying which moments to compute where - 'm' = mean, 'v' = variance, 's' = (Fisher's) skew and - 'k' = (Fisher's) kurtosis. (default='mv') -""" -_doc_default_longsummary = \ -"""Continuous random variables are defined from a standard form and may -require some shape parameters to complete its specification. 
Any -optional keyword parameters can be passed to the methods of the RV -object as given below: -""" -_doc_default_frozen_note = \ -""" -Alternatively, the object may be called (as a function) to fix the shape, -location, and scale parameters returning a "frozen" continuous RV object: - -rv = %(name)s(%(shapes)s, loc=0, scale=1) - - Frozen RV object with the same methods but holding the given shape, - location, and scale fixed. -""" -_doc_default_example = \ -"""Examples --------- ->>> import matplotlib.pyplot as plt ->>> from wafo.stats import %(name)s ->>> numargs = %(name)s.numargs ->>> [ %(shapes)s ] = [0.9,] * numargs ->>> rv = %(name)s(%(shapes)s) - -Display frozen pdf - ->>> x = np.linspace(0, np.minimum(rv.dist.b, 3)) ->>> h = plt.plot(x, rv.pdf(x)) - -Here, ``rv.dist.b`` is the right endpoint of the support of ``rv.dist``. - -Check accuracy of cdf and ppf - ->>> prb = %(name)s.cdf(x, %(shapes)s) ->>> h = plt.semilogy(np.abs(x - %(name)s.ppf(prb, %(shapes)s)) + 1e-20) - -Random number generation - ->>> R = %(name)s.rvs(%(shapes)s, size=100) - -Compare ML and MPS method ->>> phat = %(name)s.fit2(R, method='ml'); ->>> phat.plotfitsummary(); plt.figure(plt.gcf().number+1) ->>> phat2 = %(name)s.fit2(R, method='mps') ->>> phat2.plotfitsummary(); plt.figure(plt.gcf().number+1) - -Fix loc=0 and estimate shapes and scale ->>> phat3 = %(name)s.fit2(R, scale=1, floc=0, method='mps') ->>> phat3.plotfitsummary(); plt.figure(plt.gcf().number+1) - -Accurate confidence interval with profile loglikelihood ->>> lp = phat3.profile() ->>> lp.plot() ->>> pci = lp.get_bounds() - -""" - -_doc_default = ''.join([_doc_default_longsummary, - _doc_allmethods, - _doc_default_callparams, - _doc_default_frozen_note, - _doc_default_example]) - -_doc_default_before_notes = ''.join([_doc_default_longsummary, - _doc_allmethods, - _doc_default_callparams, - _doc_default_frozen_note]) - -docdict = {'rvs':_doc_rvs, - 'pdf':_doc_pdf, - 'logpdf':_doc_logpdf, - 'cdf':_doc_cdf, - 'logcdf':_doc_logcdf, - 'sf':_doc_sf, - 'logsf':_doc_logsf, - 'ppf':_doc_ppf, - 'isf':_doc_isf, - 'stats':_doc_stats, - 'entropy':_doc_entropy, - 'fit':_doc_fit, - 'moment':_doc_moment, - 'expect':_doc_expect, - 'interval':_doc_interval, - 'mean':_doc_mean, - 'std':_doc_std, - 'var':_doc_var, - 'median':_doc_median, - 'allmethods':_doc_allmethods, - 'callparams':_doc_default_callparams, - 'longsummary':_doc_default_longsummary, - 'frozennote':_doc_default_frozen_note, - 'example':_doc_default_example, - 'default':_doc_default, - 'before_notes':_doc_default_before_notes} - -# Reuse common content between continous and discrete docs, change some -# minor bits. 
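The ``docdict`` assembled above is not used verbatim: ``_construct_doc`` later fills the ``%(name)s`` and ``%(shapes)s`` placeholders per instance (via ``doccer.docformat``), and drops the shapes placeholder entirely for distributions without shape parameters. A minimal standalone sketch of that substitution step, using plain ``%`` formatting and an illustrative template rather than the real ``docdict``::

    _doc_cdf_tmpl = ("cdf(x, %(shapes)s, loc=0, scale=1)\n"
                     "    Cumulative distribution function.\n")

    def make_doc(name, shapes=None):
        """Fill a shared template with one distribution's name and shape names."""
        tmpl = _doc_cdf_tmpl
        if shapes is None:
            # same trick as _construct_doc: remove the placeholder altogether
            tmpl = tmpl.replace("%(shapes)s, ", "")
        return tmpl % {'name': name, 'shapes': shapes or ''}

    print(make_doc('lognorm', 's'))   # cdf(x, s, loc=0, scale=1) ...
    print(make_doc('expon'))          # cdf(x, loc=0, scale=1) ...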
-docdict_discrete = docdict.copy() - -docdict_discrete['pmf'] = _doc_pmf -docdict_discrete['logpmf'] = _doc_logpmf -docdict_discrete['expect'] = _doc_expect_discrete -_doc_disc_methods = ['rvs', 'pmf', 'logpmf', 'cdf', 'logcdf', 'sf', 'logsf', - 'ppf', 'isf', 'stats', 'entropy', 'expect', 'median', - 'mean', 'var', 'std', 'interval', - 'fit'] -for obj in _doc_disc_methods: - docdict_discrete[obj] = docdict_discrete[obj].replace(', scale=1', '') -docdict_discrete.pop('pdf') -docdict_discrete.pop('logpdf') - -_doc_allmethods = ''.join([docdict_discrete[obj] for obj in - _doc_disc_methods]) -docdict_discrete['allmethods'] = docheaders['methods'] + _doc_allmethods - -docdict_discrete['longsummary'] = _doc_default_longsummary.replace( - 'Continuous', 'Discrete') -_doc_default_frozen_note = \ -""" -Alternatively, the object may be called (as a function) to fix the shape and -location parameters returning a "frozen" discrete RV object: - -rv = %(name)s(%(shapes)s, loc=0) - - Frozen RV object with the same methods but holding the given shape and - location fixed. -""" -docdict_discrete['frozennote'] = _doc_default_frozen_note - -_doc_default_discrete_example = \ -"""Examples --------- ->>> from scipy.stats import %(name)s ->>> [ %(shapes)s ] = [] ->>> rv = %(name)s(%(shapes)s) - -Display frozen pmf - ->>> x = np.arange(0, np.minimum(rv.dist.b, 3)) ->>> h = plt.vlines(x, 0, rv.pmf(x), lw=2) - -Here, ``rv.dist.b`` is the right endpoint of the support of ``rv.dist``. - -Check accuracy of cdf and ppf - ->>> prb = %(name)s.cdf(x, %(shapes)s) ->>> h = plt.semilogy(np.abs(x - %(name)s.ppf(prb, %(shapes)s)) + 1e-20) - -Random number generation - ->>> R = %(name)s.rvs(%(shapes)s, size=100) - -""" -docdict_discrete['example'] = _doc_default_discrete_example - -_doc_default_before_notes = ''.join([docdict_discrete['longsummary'], - docdict_discrete['allmethods'], - docdict_discrete['callparams'], - docdict_discrete['frozennote']]) -docdict_discrete['before_notes'] = _doc_default_before_notes - -_doc_default_disc = ''.join([docdict_discrete['longsummary'], - docdict_discrete['allmethods'], - docdict_discrete['frozennote'], - docdict_discrete['example']]) -docdict_discrete['default'] = _doc_default_disc - - -# clean up all the separate docstring elements, we do not need them anymore -for obj in [s for s in dir() if s.startswith('_doc_')]: - exec('del ' + obj) -del obj -try: - del s -except NameError: - # in Python 3, loop variables are not visible after the loop - pass - - -def _moment(data, n, mu=None): - if mu is None: - mu = data.mean() - return ((data - mu)**n).mean() - - -def _moment_from_stats(n, mu, mu2, g1, g2, moment_func, args): - if (n == 0): - return 1.0 - elif (n == 1): - if mu is None: - val = moment_func(1,*args) - else: - val = mu - elif (n == 2): - if mu2 is None or mu is None: - val = moment_func(2,*args) - else: - val = mu2 + mu*mu - elif (n == 3): - if g1 is None or mu2 is None or mu is None: - val = moment_func(3,*args) - else: - mu3 = g1 * np.power(mu2, 1.5) # 3rd central moment - val = mu3+3*mu*mu2+mu*mu*mu # 3rd non-central moment - elif (n == 4): - if g1 is None or g2 is None or mu2 is None or mu is None: - val = moment_func(4,*args) - else: - mu4 = (g2+3.0)*(mu2**2.0) # 4th central moment - mu3 = g1*np.power(mu2, 1.5) # 3rd central moment - val = mu4+4*mu*mu3+6*mu*mu*mu2+mu*mu*mu*mu - else: - val = moment_func(n, *args) - - return val - - -def _skew(data): - """ - skew is third central moment / variance**(1.5) - """ - data = np.ravel(data) - mu = data.mean() - m2 = ((data - 
mu)**2).mean() - m3 = ((data - mu)**3).mean() - return m3 / np.power(m2, 1.5) - - -def _kurtosis(data): - """ - kurtosis is fourth central moment / variance**2 - 3 - """ - data = np.ravel(data) - mu = data.mean() - m2 = ((data - mu)**2).mean() - m4 = ((data - mu)**4).mean() - return m4 / m2**2 - 3 - - -# Frozen RV class -class rv_frozen(object): - ''' Frozen continous or discrete 1D Random Variable object (RV) - - Methods - ------- - RV.rvs(size=1) - - random variates - - RV.pdf(x) - - probability density function (continous case) - - RV.pmf(x) - - probability mass function (discrete case) - - RV.cdf(x) - - cumulative density function - - RV.sf(x) - - survival function (1-cdf --- sometimes more accurate) - - RV.ppf(q) - - percent point function (inverse of cdf --- percentiles) - - RV.isf(q) - - inverse survival function (inverse of sf) - - RV.stats(moments='mv') - - mean('m'), variance('v'), skew('s'), and/or kurtosis('k') - - RV.entropy() - - (differential) entropy of the RV. - - Parameters - ---------- - x : array-like - quantiles - q : array-like - lower or upper tail probability - size : int or tuple of ints, optional, keyword - shape of random variates - moments : string, optional, keyword - one or more of 'm' mean, 'v' variance, 's' skewness, 'k' kurtosis - ''' - def __init__(self, dist, *args, **kwds): - self.dist = dist - loc0, scale0 = map(kwds.get, ['loc', 'scale']) - if hasattr(dist, 'fix_loc_scale'): #isinstance(dist, rv_continuous): - args, loc0, scale0 = dist.fix_loc_scale(args, loc0, scale0) - self.par = args + (loc0, scale0) - else: # rv_discrete - args, loc0 = dist.fix_loc(args, loc0) - self.par = args + (loc0,) - - def pdf(self, x): - ''' Probability density function at x of the given RV.''' - return self.dist.pdf(x, *self.par) - def logpdf(self, x): - return self.dist.logpdf(x, *self.par) - def cdf(self, x): - '''Cumulative distribution function at x of the given RV.''' - return self.dist.cdf(x, *self.par) - def logcdf(self, x): - return self.dist.logcdf(x, *self.par) - def ppf(self, q): - '''Percent point function (inverse of cdf) at q of the given RV.''' - return self.dist.ppf(q, *self.par) - def isf(self, q): - '''Inverse survival function at q of the given RV.''' - return self.dist.isf(q, *self.par) - def rvs(self, size=None): - '''Random variates of given type.''' - kwds = dict(size=size) - return self.dist.rvs(*self.par, **kwds) - def sf(self, x): - '''Survival function (1-cdf) at x of the given RV.''' - return self.dist.sf(x, *self.par) - def logsf(self, x): - return self.dist.logsf(x, *self.par) - def stats(self, moments='mv'): - ''' Some statistics of the given RV''' - kwds = dict(moments=moments) - return self.dist.stats(*self.par, **kwds) - def median(self): - return self.dist.median(*self.par) - def mean(self): - return self.dist.mean(*self.par) - def var(self): - return self.dist.var(*self.par) - def std(self): - return self.dist.std(*self.par) - def moment(self, n): - return self.dist.moment(n, *self.par) - def entropy(self): - return self.dist.entropy(*self.par) - def pmf(self, k): - '''Probability mass function at k of the given RV''' - return self.dist.pmf(k, *self.par) - def logpmf(self,k): - return self.dist.logpmf(k, *self.par) - def interval(self, alpha): - return self.dist.interval(alpha, *self.par) - -# Frozen RV class -class rv_frozen_old(object): - def __init__(self, dist, *args, **kwds): - self.args = args - self.kwds = kwds - self.dist = dist - - def pdf(self, x): # raises AttributeError in frozen discrete distribution - return self.dist.pdf(x, 
*self.args, **self.kwds) - - def logpdf(self, x): - return self.dist.logpdf(x, *self.args, **self.kwds) - - def cdf(self, x): - return self.dist.cdf(x, *self.args, **self.kwds) - - def logcdf(self, x): - return self.dist.logcdf(x, *self.args, **self.kwds) - - def ppf(self, q): - return self.dist.ppf(q, *self.args, **self.kwds) - - def isf(self, q): - return self.dist.isf(q, *self.args, **self.kwds) - - def rvs(self, size=None): - kwds = self.kwds.copy() - kwds.update({'size':size}) - return self.dist.rvs(*self.args, **kwds) - - def sf(self, x): - return self.dist.sf(x, *self.args, **self.kwds) - - def logsf(self, x): - return self.dist.logsf(x, *self.args, **self.kwds) - - def stats(self, moments='mv'): - kwds = self.kwds.copy() - kwds.update({'moments':moments}) - return self.dist.stats(*self.args, **kwds) - - def median(self): - return self.dist.median(*self.args, **self.kwds) - - def mean(self): - return self.dist.mean(*self.args, **self.kwds) - - def var(self): - return self.dist.var(*self.args, **self.kwds) - - def std(self): - return self.dist.std(*self.args, **self.kwds) - - def moment(self, n): - return self.dist.moment(n, *self.args, **self.kwds) - - def entropy(self): - return self.dist.entropy(*self.args, **self.kwds) - - def pmf(self,k): - return self.dist.pmf(k, *self.args, **self.kwds) - - def logpmf(self,k): - return self.dist.logpmf(k, *self.args, **self.kwds) - - def interval(self, alpha): - return self.dist.interval(alpha, *self.args, **self.kwds) - - -def stirlerr(n): - """ - Return error of Stirling approximation, - i.e., log(n!) - log( sqrt(2*pi*n)*(n/exp(1))**n ) - - Example - ------- - >>> stirlerr(2) - array([ 0.0413407]) - - See also - --------- - binom - - - Reference - ----------- - Catherine Loader (2000). - 'Fast and Accurate Computation of Binomial Probabilities' - - """ - - S0 = 0.083333333333333333333 # /* 1/12 */ - S1 = 0.00277777777777777777778 # /* 1/360 */ - S2 = 0.00079365079365079365079365 # /* 1/1260 */ - S3 = 0.000595238095238095238095238 # /* 1/1680 */ - S4 = 0.0008417508417508417508417508 # /* 1/1188 */ - - logical_and = numpy.logical_and - atleast_1d = numpy.atleast_1d - gammaln = special.gammaln - pi = numpy.pi - exp = numpy.exp - sqrt = numpy.sqrt - log = numpy.log - - n1 = atleast_1d(n) -# if numpy.isscalar(n): -# n1 = asfarray([n]) -# else: -# n1 = asfarray(n) - - y = gammaln(n1 + 1) - log(sqrt(2 * pi * n1) * (n1 / exp(1)) ** n1) - - - nn = n1 * n1 - - n500 = 500 < n1 - y[n500] = (S0 - S1 / nn[n500]) / n1[n500] - n80 = logical_and(80 < n1 , n1 <= 500) - if any(n80): - y[n80] = (S0 - (S1 - S2 / nn[n80]) / nn[n80]) / n1[n80] - n35 = logical_and(35 < n1, n1 <= 80) - if any(n35): - nn35 = nn[n35] - y[n35] = (S0 - (S1 - (S2 - S3 / nn35) / nn35) / nn35) / n1[n35] - - n15 = logical_and(15 < n1, n1 <= 35) - if any(n15): - nn15 = nn[n15] - y[n15] = (S0 - (S1 - (S2 - (S3 - S4 / nn15) / nn15) / nn15) / nn15) / n1[n15] - - return y - -def bd0(x, npr): - """ - Return deviance term x*log(x/npr) + npr - x - - See also - -------- - stirlerr, - binom.pmf, - poisson.pmf - - Reference - --------- - Catherine Loader (2000). - 'Fast and Accurate Computation of Binomial Probabilities' - - """ - def bd0_iter(x, np1): - xmnp = x - np1 - v = (xmnp) / (x + np1) - s1 = (xmnp) * v - s = np.zeros_like(s1) - ej = 2 * x * v - #v2 = v*v - v = v * v - j = 0 - ix, = (s != s1).nonzero() - while ix.size > 0: - j += 1 - s[ix] = s1[ix].copy() - ej[ix] = ej[ix] * v[ix] - s1[ix] = s[ix] + ej[ix] / (2. 
* j + 1.0) - ix, = (s1 != s).nonzero() - return s1 - x1, npr1 = atleast_1d(x, npr) - y = x1 * log(x1 / npr1) + npr1 - x1 - sml = nonzero(abs(x1 - npr1) < 0.1 * (x1 + npr1)) - if sml.size > 0: - if x1.size != 1: - x1 = x1[sml] - if npr1.size != 1: - npr1 = npr1[sml] - y.put(sml, bd0_iter(x1, npr1)) - return y - -## NANs are returned for unsupported parameters. -## location and scale parameters are optional for each distribution. -## The shape parameters are generally required -## -## The loc and scale parameters must be given as keyword parameters. -## These are related to the common symbols in the .lyx file - -## skew is third central moment / variance**(1.5) -## kurtosis is fourth central moment / variance**2 - 3 - - -## References:: - -## Documentation for ranlib, rv2, cdflib and -## -## Eric Weisstein's world of mathematics http://mathworld.wolfram.com/ -## http://mathworld.wolfram.com/topics/StatisticalDistributions.html -## -## Documentation to Regress+ by Michael McLaughlin -## -## Engineering and Statistics Handbook (NIST) -## http://www.itl.nist.gov/div898/handbook/index.htm -## -## Documentation for DATAPLOT from NIST -## http://www.itl.nist.gov/div898/software/dataplot/distribu.htm -## -## Norman Johnson, Samuel Kotz, and N. Balakrishnan "Continuous -## Univariate Distributions", second edition, -## Volumes I and II, Wiley & Sons, 1994. - - - -def valarray(shape,value=nan,typecode=None): - """Return an array of all value. - """ - - out = ones(shape, dtype=bool) * value - if typecode is not None: - out = out.astype(typecode) - if not isinstance(out, ndarray): - out = asarray(out) - return out - - -# This should be rewritten -def argsreduce(cond, *args): - """Return the sequence of ravel(args[i]) where ravel(condition) is - True in 1D. - - Examples - -------- - >>> import numpy as np - >>> rand = np.random.random_sample - >>> A = rand((4,5)) - >>> B = 2 - >>> C = rand((1,5)) - >>> cond = np.ones(A.shape) - >>> [A1,B1,C1] = argsreduce(cond,A,B,C) - >>> B1.shape - (20,) - >>> cond[2,:] = 0 - >>> [A2,B2,C2] = argsreduce(cond,A,B,C) - >>> B2.shape - (15,) - - """ - newargs = atleast_1d(*args) - if not isinstance(newargs, list): - newargs = [newargs,] - expand_arr = (cond == cond) - return [extract(cond, arr1 * expand_arr) for arr1 in newargs] - - - -parse_arg_template = """ -def _parse_args(self, %(shape_arg_str)s %(locscale_in)s): - return (%(shape_arg_str)s), %(locscale_out)s - -def _parse_args_rvs(self, %(shape_arg_str)s %(locscale_in)s, size=None): - return (%(shape_arg_str)s), %(locscale_out)s, size - -def _parse_args_stats(self, %(shape_arg_str)s %(locscale_in)s, moments='mv'): - return (%(shape_arg_str)s), %(locscale_out)s, moments -""" - - -def common_shape(*args, **kwds): - ''' Return the common shape of a sequence of arrays - - Parameters - ----------- - *args : arraylike - sequence of arrays - **kwds : - shape - - Returns - ------- - shape : tuple - common shape of the elements of args. - - Raises - ------ - An error is raised if some of the arrays do not conform - to the common shape according to the broadcasting rules in numpy. 
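``stirlerr`` and ``bd0`` are the two building blocks of Loader's (2000) scheme for evaluating binomial and Poisson probabilities without catastrophic cancellation. One way they combine, sketched with simplified (non-expanded, non-iterative) versions of both helpers; the composition below follows the reference and is illustrative, not code from this module::

    import numpy as np
    from scipy.special import gammaln

    def stirlerr(n):
        # log(n!) - log(sqrt(2*pi*n) * (n/e)**n), direct form
        n = np.asarray(n, dtype=float)
        return gammaln(n + 1) - np.log(np.sqrt(2 * np.pi * n) * (n / np.e) ** n)

    def bd0(x, npr):
        # deviance term x*log(x/npr) + npr - x, direct form
        x, npr = np.asarray(x, float), np.asarray(npr, float)
        return x * np.log(x / npr) + npr - x

    def poisson_pmf(k, lam):
        # Loader-style evaluation: exp(-stirlerr(k) - bd0(k, lam)) / sqrt(2*pi*k)
        k = np.asarray(k, dtype=float)
        return np.exp(-stirlerr(k) - bd0(k, lam)) / np.sqrt(2 * np.pi * k)

    print(poisson_pmf(3, 2.5), np.exp(-2.5) * 2.5 ** 3 / 6)  # should agree closely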
- - Examples - -------- - >>> import numpy as np - >>> A = np.ones((4,1)) - >>> B = 2 - >>> C = np.ones((1,5))*5 - >>> common_shape(A,B,C) - (4, 5) - >>> common_shape(A,B,C,shape=(3,4,1)) - (3, 4, 5) - - See also - -------- - broadcast, broadcast_arrays - ''' - - - shape = kwds.get('shape') - argsout = atleast_1d(*args) - if not isinstance(argsout, list): - argsout = [argsout, ] - args_shape = [arg.shape for arg in argsout] #map(shape, varargout) - if shape != None: - if not isinstance(shape, (list, tuple)): - shape = (shape,) - args_shape.append(tuple(shape)) - - if len(set(args_shape)) == 1: - # Common case - return tuple(args_shape[0]) - - ndims = map(len, args_shape) - ndim = max(ndims) - Np = len(args_shape) - - all_shapes = ones((Np, ndim), dtype=int) - for ix, Nt in enumerate(ndims): - all_shapes[ix, ndim - Nt::] = args_shape[ix] - - ndims = atleast_1d(ndims) - if any(ndims == 0): - all_shapes[ndims == 0, :] = 0 - - comn_shape = all_shapes.max(axis=0) - - arrays_do_not_conform2common_shape = any(logical_and(all_shapes != comn_shape[newaxis, ...], all_shapes != 1), axis=1) - - if any(arrays_do_not_conform2common_shape): - raise ValueError('Non-scalar input arguments do not match in shape according to numpy broadcasting rules') - - return tuple(comn_shape) - - -class rv_generic(object): - """Class which encapsulates common functionality between rv_discrete - and rv_continuous. - - """ - def _construct_argparser(self, names_to_inspect, locscale_in, locscale_out): - """Construct the parser for the shape arguments. - - Generates the argument-parsing functions dynamically. - Modifies the calling class. - Is supposed to be called in __init__ of a class for each distribution. - - If self.shapes is a non-empty string, interprets it as a comma-separated - list of shape parameters. - - Otherwise inspects the call signatures of `names_to_inspect` - and constructs the argument-parsing functions from these. - In this case also sets `shapes` and `numargs`. 
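As described above, the shape names are discovered by introspecting the subclass's ``_pdf``/``_cdf`` signatures and are then baked into small ``_parse_args*`` helpers by filling ``parse_arg_template`` and executing the result. A stripped-down, standalone sketch of that idea (``inspect.getfullargspec`` stands in for the ``getargspec`` call used here, and the one-shape ``_pdf`` is purely illustrative)::

    import inspect

    template = ("def _parse_args(%(shape_arg_str)s loc=0, scale=1):\n"
                "    return (%(shape_arg_str)s), loc, scale\n")

    def _pdf(x, c):            # stand-in for a subclass _pdf with one shape 'c'
        return c * x ** (c - 1.0)

    # everything after the first argument (x) is treated as a shape parameter
    shapes = inspect.getfullargspec(_pdf).args[1:]
    shape_arg_str = ', '.join(shapes) + ', ' if shapes else ''

    ns = {}
    exec(template % dict(shape_arg_str=shape_arg_str), ns)
    print(ns['_parse_args'](2.0, loc=1, scale=3))   # -> ((2.0,), 1, 3)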
- """ - - if self.shapes: - # sanitize the user-supplied shapes - if not isinstance(self.shapes, string_types): - raise TypeError('shapes must be a string.') - - shapes = self.shapes.replace(',', ' ').split() - - for field in shapes: - if keyword.iskeyword(field): - raise SyntaxError('keywords cannot be used as shapes.') - if not re.match('^[_a-zA-Z][_a-zA-Z0-9]*$', field): - raise SyntaxError('shapes must be valid python identifiers') - else: - # find out the call signatures (_pdf, _cdf etc), deduce shape arguments - shapes_list = [] - for name in names_to_inspect: - # look for names in instance methods, then global namespace - # the latter is needed for rv_discrete with explicit `values` - try: - meth = get_method_function(getattr(self, name)) - except: - meth = globals()[name] - shapes_args = inspect.getargspec(meth) - shapes_list.append(shapes_args.args) - - # *args or **kwargs are not allowed w/automatic shapes - # (generic methods have 'self, x' only) - if len(shapes_args.args) > 2: - if shapes_args.varargs is not None: - raise TypeError('*args are not allowed w/out explicit shapes') - if shapes_args.keywords is not None: - raise TypeError('**kwds are not allowed w/out explicit shapes') - if shapes_args.defaults is not None: - raise TypeError('defaults are not allowed for shapes') - - shapes = max(shapes_list, key=lambda x: len(x)) - shapes = shapes[2:] # remove self, x, - - # make sure the signatures are consistent - # (generic methods have 'self, x' only) - for item in shapes_list: - if len(item) > 2 and item[2:] != shapes: - raise TypeError('Shape arguments are inconsistent.') - - # have the arguments, construct the method from template - shapes_str = ', '.join(shapes) + ', ' if shapes else '' # NB: not None - dct = dict(shape_arg_str=shapes_str, - locscale_in=locscale_in, - locscale_out=locscale_out, - ) - ns = {} - exec_(parse_arg_template % dct, ns) - # NB: attach to the instance, not class - for name in ['_parse_args', '_parse_args_stats', '_parse_args_rvs']: - setattr(self, name, - instancemethod(ns[name], self, self.__class__) - ) - - self.shapes = ', '.join(shapes) if shapes else None - if not hasattr(self, 'numargs'): - # allows more general subclassing with *args - self.numargs = len(shapes) - - def _fix_loc_scale(self, args, loc, scale=1): - """Parse args/kwargs input to other methods.""" - args, loc, scale, kwarg3 = self._fix_loc_scale_kwarg3(args, loc, scale, - None, None) - if kwarg3 is not None: - # 3 positional args - raise TypeError("Too many input arguments.") - - return args, loc, scale - - def _fix_loc_scale_kwarg3(self, args, loc, scale=1, - kwarg3=1, kwarg3_default=None): - """Parse args/kwargs input to methods with a third kwarg. - - At the moment these methods are ``stats`` and ``rvs``. 
- """ - N = len(args) - if N > self.numargs: - if N == self.numargs + 1 and loc is None: - # loc is given without keyword - loc = args[-1] - elif N == self.numargs + 2 and loc is None and scale is None: - # loc and scale given without keyword - loc, scale = args[-2:] - elif N == self.numargs + 3 and loc is None and scale is None \ - and kwarg3 is None: - # loc, scale and a third argument - loc, scale, kwarg3 = args[-3:] - else: - raise TypeError("Too many input arguments.") - - args = args[:self.numargs] - - if scale is None: - scale = 1.0 - if loc is None: - loc = 0.0 - if kwarg3 is None: - kwarg3 = kwarg3_default - - return args, loc, scale, kwarg3 - - def _fix_loc(self, args, loc): - args, loc, _scale = self._fix_loc_scale(args, loc) - return args, loc - - # These are actually called, and should not be overwritten if you - # want to keep error checking. - def rvs(self, *args, **kwds): - """ - Random variates of given type. - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - scale : array_like, optional - Scale parameter (default=1). - size : int or tuple of ints, optional - Defining number of random variates (default=1). - - Returns - ------- - rvs : ndarray or scalar - Random variates of given `size`. - - """ - discrete = kwds.pop('discrete', None) - args, loc, scale, size = self._parse_args_rvs(*args, **kwds) - cond = logical_and(self._argcheck(*args), (scale >= 0)) - if not all(cond): - raise ValueError("Domain error in arguments.") - - # self._size is total size of all output values - self._size = product(size, axis=0) - if self._size is not None and self._size > 1: - size = numpy.array(size, ndmin=1) - - if np.all(scale == 0): - return loc*ones(size, 'd') - - vals = self._rvs(*args) - if self._size is not None: - vals = reshape(vals, size) - - vals = vals * scale + loc - - # Cast to int if discrete - if discrete: - if numpy.isscalar(vals): - vals = int(vals) - else: - vals = vals.astype(int) - - return vals - - def median(self, *args, **kwds): - """ - Median of the distribution. - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - Location parameter, Default is 0. - scale : array_like, optional - Scale parameter, Default is 1. - - Returns - ------- - median : float - The median of the distribution. - - See Also - -------- - stats.distributions.rv_discrete.ppf - Inverse of the CDF - - """ - return self.ppf(0.5, *args, **kwds) - - def mean(self, *args, **kwds): - """ - Mean of the distribution - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - mean : float - the mean of the distribution - """ - kwds['moments'] = 'm' - res = self.stats(*args, **kwds) - if isinstance(res, ndarray) and res.ndim == 0: - return res[()] - return res - - def var(self, *args, **kwds): - """ - Variance of the distribution - - Parameters - ---------- - arg1, arg2, arg3,... 
: array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - var : float - the variance of the distribution - - """ - kwds['moments'] = 'v' - res = self.stats(*args, **kwds) - if isinstance(res, ndarray) and res.ndim == 0: - return res[()] - return res - - def std(self, *args, **kwds): - """ - Standard deviation of the distribution. - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - std : float - standard deviation of the distribution - - """ - kwds['moments'] = 'v' - res = sqrt(self.stats(*args, **kwds)) - return res - - def interval(self, alpha, *args, **kwds): - """ - Confidence interval with equal areas around the median. - - Parameters - ---------- - alpha : array_like of float - Probability that an rv will be drawn from the returned range. - Each value should be in the range [0, 1]. - arg1, arg2, ... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - location parameter, Default is 0. - scale : array_like, optional - scale parameter, Default is 1. - - Returns - ------- - a, b : ndarray of float - end-points of range that contain ``100 * alpha %`` of the rv's possible - values. - - """ - alpha = asarray(alpha) - if any((alpha > 1) | (alpha < 0)): - raise ValueError("alpha must be between 0 and 1 inclusive") - q1 = (1.0-alpha)/2 - q2 = (1.0+alpha)/2 - a = self.ppf(q1, *args, **kwds) - b = self.ppf(q2, *args, **kwds) - return a, b - - -## continuous random variables: implement maybe later -## -## hf --- Hazard Function (PDF / SF) -## chf --- Cumulative hazard function (-log(SF)) -## psf --- Probability sparsity function (reciprocal of the pdf) in -## units of percent-point-function (as a function of q). -## Also, the derivative of the percent-point function. - -class rv_continuous(rv_generic): - """ - A generic continuous random variable class meant for subclassing. - - `rv_continuous` is a base class to construct specific distribution classes - and instances from for continuous random variables. It cannot be used - directly as a distribution. - - Parameters - ---------- - momtype : int, optional - The type of generic moment calculation to use: 0 for pdf, 1 (default) - for ppf. - a : float, optional - Lower bound of the support of the distribution, default is minus - infinity. - b : float, optional - Upper bound of the support of the distribution, default is plus - infinity. - xtol : float, optional - The tolerance for fixed point calculation for generic ppf. - badvalue : object, optional - The value in a result arrays that indicates a value that for which - some argument restriction is violated, default is np.nan. - name : str, optional - The name of the instance. This string is used to construct the default - example for distributions. - longname : str, optional - This string is used as part of the first line of the docstring returned - when a subclass has no docstring of its own. Note: `longname` exists - for backwards compatibility, do not use for new subclasses. 
- shapes : str, optional - The shape of the distribution. For example ``"m, n"`` for a - distribution that takes two integers as the two shape arguments for all - its methods. - extradoc : str, optional, deprecated - This string is used as the last part of the docstring returned when a - subclass has no docstring of its own. Note: `extradoc` exists for - backwards compatibility, do not use for new subclasses. - - Methods - ------- - rvs(, loc=0, scale=1, size=1) - random variates - - pdf(x, , loc=0, scale=1) - probability density function - - logpdf(x, , loc=0, scale=1) - log of the probability density function - - cdf(x, , loc=0, scale=1) - cumulative density function - - logcdf(x, , loc=0, scale=1) - log of the cumulative density function - - sf(x, , loc=0, scale=1) - survival function (1-cdf --- sometimes more accurate) - - logsf(x, , loc=0, scale=1) - log of the survival function - - ppf(q, , loc=0, scale=1) - percent point function (inverse of cdf --- quantiles) - - isf(q, , loc=0, scale=1) - inverse survival function (inverse of sf) - - moment(n, , loc=0, scale=1) - non-central n-th moment of the distribution. May not work for array arguments. - - stats(, loc=0, scale=1, moments='mv') - mean('m'), variance('v'), skew('s'), and/or kurtosis('k') - - entropy(, loc=0, scale=1) - (differential) entropy of the RV. - - fit(data, , loc=0, scale=1) - Parameter estimates for generic data - - expect(func=None, args=(), loc=0, scale=1, lb=None, ub=None, - conditional=False, **kwds) - Expected value of a function with respect to the distribution. - Additional kwd arguments passed to integrate.quad - - median(, loc=0, scale=1) - Median of the distribution. - - mean(, loc=0, scale=1) - Mean of the distribution. - - std(, loc=0, scale=1) - Standard deviation of the distribution. - - var(, loc=0, scale=1) - Variance of the distribution. - - interval(alpha, , loc=0, scale=1) - Interval that with `alpha` percent probability contains a random - realization of this distribution. - - __call__(, loc=0, scale=1) - Calling a distribution instance creates a frozen RV object with the - same methods but holding the given shape, location, and scale fixed. - See Notes section. - - **Parameters for Methods** - - x : array_like - quantiles - q : array_like - lower or upper tail probability - : array_like - shape parameters - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - size : int or tuple of ints, optional - shape of random variates (default computed from input arguments ) - moments : string, optional - composed of letters ['mvsk'] specifying which moments to compute where - 'm' = mean, 'v' = variance, 's' = (Fisher's) skew and - 'k' = (Fisher's) kurtosis. (default='mv') - n : int - order of moment to calculate in method moments - - Notes - ----- - - **Methods that can be overwritten by subclasses** - :: - - _rvs - _pdf - _cdf - _sf - _ppf - _isf - _stats - _munp - _entropy - _argcheck - - There are additional (internal and private) generic methods that can - be useful for cross-checking and for debugging, but might work in all - cases when directly called. 
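Most entries in the list above have safe (if slow) numerical defaults, so a subclass only has to supply ``_pdf`` or ``_cdf``; anything else it defines is purely a speed or accuracy override. A runnable sketch, written against ``scipy.stats.rv_continuous`` (whose interface this class follows) for the standard power-function law on [0, 1]::

    from scipy.stats import rv_continuous

    class powerfunc_gen(rv_continuous):
        "Power-function distribution, pdf = c*x**(c-1) on 0 <= x <= 1"
        def _pdf(self, x, c):
            return c * x ** (c - 1.0)
        def _cdf(self, x, c):          # optional: avoids numerical integration
            return x ** c
        def _ppf(self, q, c):          # optional: avoids the brentq-based inverse
            return q ** (1.0 / c)

    powerfunc = powerfunc_gen(a=0.0, b=1.0, name='powerfunc')
    print(powerfunc.cdf(0.5, 3.0))              # 0.125
    print(powerfunc.ppf(0.125, 3.0))            # 0.5
    print(powerfunc.stats(3.0, moments='mv'))   # mean 0.75, variance 0.0375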
- - **Frozen Distribution** - - Alternatively, the object may be called (as a function) to fix the shape, - location, and scale parameters returning a "frozen" continuous RV object: - - rv = generic(, loc=0, scale=1) - frozen RV object with the same methods but holding the given shape, - location, and scale fixed - - **Subclassing** - - New random variables can be defined by subclassing rv_continuous class - and re-defining at least the ``_pdf`` or the ``_cdf`` method (normalized - to location 0 and scale 1) which will be given clean arguments (in between - a and b) and passing the argument check method. - - If positive argument checking is not correct for your RV - then you will also need to re-define the ``_argcheck`` method. - - Correct, but potentially slow defaults exist for the remaining - methods but for speed and/or accuracy you can over-ride:: - - _logpdf, _cdf, _logcdf, _ppf, _rvs, _isf, _sf, _logsf - - Rarely would you override ``_isf``, ``_sf`` or ``_logsf``, but you could. - - Statistics are computed using numerical integration by default. - For speed you can redefine this using ``_stats``: - - - take shape parameters and return mu, mu2, g1, g2 - - If you can't compute one of these, return it as None - - Can also be defined with a keyword argument ``moments=``, - where is a string composed of 'm', 'v', 's', - and/or 'k'. Only the components appearing in string - should be computed and returned in the order 'm', 'v', - 's', or 'k' with missing values returned as None. - - Alternatively, you can override ``_munp``, which takes n and shape - parameters and returns the nth non-central moment of the distribution. - - A note on ``shapes``: subclasses need not specify them explicitly. In this - case, the `shapes` will be automatically deduced from the signatures of the - overridden methods. - If, for some reason, you prefer to avoid relying on introspection, you can - specify ``shapes`` explicitly as an argument to the instance constructor. - - Examples - -------- - To create a new Gaussian distribution, we would do the following:: - - class gaussian_gen(rv_continuous): - "Gaussian distribution" - def _pdf(self, x): - ... - ... - - """ - - def __init__(self, momtype=1, a=None, b=None, xtol=1e-14, - badvalue=None, name=None, longname=None, - shapes=None, extradoc=None): - - rv_generic.__init__(self) - self.fix_loc_scale = self._fix_loc_scale - - if badvalue is None: - badvalue = nan - if name is None: - name = 'Distribution' - self.badvalue = badvalue - self.name = name - self.a = a - self.b = b - if a is None: - self.a = -inf - if b is None: - self.b = inf - self.xtol = xtol - self._size = 1 - self.m = 0.0 - self.moment_type = momtype - - self.expandarr = 1 - - self.shapes = shapes - self._construct_argparser(names_to_inspect=['_pdf', '_cdf'], - locscale_in='loc=0, scale=1', - locscale_out='loc, scale') - - # nin correction - self.vecfunc = vectorize(self._ppf_single_call, otypes='d') - self.vecfunc.nin = self.numargs + 1 - self.vecentropy = vectorize(self._entropy, otypes='d') - self.vecentropy.nin = self.numargs + 1 - self.veccdf = vectorize(self._cdf_single_call, otypes='d') - self.veccdf.nin = self.numargs + 1 - - self.extradoc = extradoc - if momtype == 0: - self.generic_moment = vectorize(self._mom0_sc, otypes='d') - else: - self.generic_moment = vectorize(self._mom1_sc, otypes='d') - self.generic_moment.nin = self.numargs+1 # Because of the *args argument - # of _mom0_sc, vectorize cannot count the number of arguments correctly. 
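The constructor above wires the numerical fallbacks together: a scalar Brent search on ``cdf(x) - q`` becomes the generic ppf, a scalar quadrature of the pdf becomes the generic cdf, and ``vectorize`` turns both into array-aware methods. The same pattern in isolation, for a fixed standard exponential (purely illustrative, not part of the class)::

    import numpy as np
    from scipy import integrate, optimize

    def pdf(x):
        return np.exp(-x) * (x >= 0)

    def cdf_single(x):
        # generic cdf: integrate the pdf from the lower support bound up to x
        return integrate.quad(pdf, 0.0, x)[0]

    def ppf_single(q):
        # generic ppf: bracket the root of cdf(x) - q, then refine with brentq
        right = 10.0
        while cdf_single(right) < q:
            right *= 10.0
        return optimize.brentq(lambda x: cdf_single(x) - q, 0.0, right, xtol=1e-14)

    cdf = np.vectorize(cdf_single, otypes='d')
    ppf = np.vectorize(ppf_single, otypes='d')
    print(cdf([0.5, 1.0]))        # ~[0.3935, 0.6321]
    print(ppf(cdf([0.5, 1.0])))   # ~[0.5, 1.0]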
- - if longname is None: - if name[0] in ['aeiouAEIOU']: - hstr = "An " - else: - hstr = "A " - longname = hstr + name - - if sys.flags.optimize < 2: - # Skip adding docstrings if interpreter is run with -OO - if self.__doc__ is None: - self._construct_default_doc(longname=longname, extradoc=extradoc) - else: - self._construct_doc() - - ## This only works for old-style classes... - # self.__class__.__doc__ = self.__doc__ - - - def _construct_default_doc(self, longname=None, extradoc=None): - """Construct instance docstring from the default template.""" - if longname is None: - longname = 'A' - if extradoc is None: - extradoc = '' - if extradoc.startswith('\n\n'): - extradoc = extradoc[2:] - self.__doc__ = ''.join(['%s continuous random variable.' % longname, - '\n\n%(before_notes)s\n', docheaders['notes'], - extradoc, '\n%(example)s']) - self._construct_doc() - - def _construct_doc(self): - """Construct the instance docstring with string substitutions.""" - tempdict = docdict.copy() - tempdict['name'] = self.name or 'distname' - tempdict['shapes'] = self.shapes or '' - - if self.shapes is None: - # remove shapes from call parameters if there are none - for item in ['callparams', 'default', 'before_notes']: - tempdict[item] = tempdict[item].replace( - "\n%(shapes)s : array_like\n shape parameters", "") - for i in range(2): - if self.shapes is None: - # necessary because we use %(shapes)s in two forms (w w/o ", ") - self.__doc__ = self.__doc__.replace("%(shapes)s, ", "") - self.__doc__ = doccer.docformat(self.__doc__, tempdict) - - def _ppf_to_solve(self, x, q,*args): - return self.cdf(*(x, )+args)-q - - def _ppf_single_call(self, q, *args): - left = right = None - if self.a > -np.inf: - left = self.a - if self.b < np.inf: - right = self.b - - factor = 10. - if not left: # i.e. self.a = -inf - left = -1.*factor - while self._ppf_to_solve(left, q,*args) > 0.: - right = left - left *= factor - # left is now such that cdf(left) < q - if not right: # i.e. self.b = inf - right = factor - while self._ppf_to_solve(right, q,*args) < 0.: - left = right - right *= factor - # right is now such that cdf(right) > q - - return optimize.brentq(self._ppf_to_solve, - left, right, args=(q,)+args, xtol=self.xtol) - - # moment from definition - def _mom_integ0(self, x,m,*args): - return x**m * self.pdf(x,*args) - - def _mom0_sc(self, m,*args): - return integrate.quad(self._mom_integ0, self.a, - self.b, args=(m,)+args)[0] - - # moment calculated using ppf - def _mom_integ1(self, q,m,*args): - return (self.ppf(q,*args))**m - - def _mom1_sc(self, m,*args): - return integrate.quad(self._mom_integ1, 0, 1,args=(m,)+args)[0] - - ## These are the methods you must define (standard form functions) - def _argcheck(self, *args): - # Default check for correct values on args and keywords. - # Returns condition array of 1's where arguments are correct and - # 0's where they are not. - cond = 1 - for arg in args: - cond = logical_and(cond,(asarray(arg) > 0)) - return cond - - def _pdf(self,x,*args): - return derivative(self._cdf,x,dx=1e-5,args=args,order=5) - - ## Could also define any of these - def _logpdf(self, x, *args): - return log(self._pdf(x, *args)) - - ##(return 1-d using self._size to get number) - def _rvs(self, *args): - ## Use basic inverse cdf algorithm for RV generation as default. 
- U = mtrand.sample(self._size) - Y = self._ppf(U,*args) - return Y - - def _cdf_single_call(self, x, *args): - return integrate.quad(self._pdf, self.a, x, args=args)[0] - - def _cdf(self, x, *args): - return self.veccdf(x,*args) - - def _logcdf(self, x, *args): - return log(self._cdf(x, *args)) - - def _sf(self, x, *args): - return 1.0-self._cdf(x,*args) - - def _logsf(self, x, *args): - return log(self._sf(x, *args)) - - def _chf(self, x, *args): - return - log1p(-self._cdf(x, *args)) - - def _ppf(self, q, *args): - return self.vecfunc(q,*args) - - def _isf(self, q, *args): - return self._ppf(1.0-q,*args) # use correct _ppf for subclasses - - # The actual calculation functions (no basic checking need be done) - # If these are defined, the others won't be looked at. - # Otherwise, the other set can be defined. - def _stats(self,*args, **kwds): - return None, None, None, None - - # Central moments - def _munp(self,n,*args): - return self.generic_moment(n,*args) - - def pdf(self,x,*args,**kwds): - """ - Probability density function at x of the given RV. - - Parameters - ---------- - x : array_like - quantiles - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - pdf : ndarray - Probability density function evaluated at x - - """ - args, loc, scale = self._parse_args(*args, **kwds) - x,loc,scale = map(asarray,(x,loc,scale)) - args = tuple(map(asarray,args)) - x = asarray((x-loc)*1.0/scale) - cond0 = self._argcheck(*args) & (scale > 0) - cond1 = (scale > 0) & (x >= self.a) & (x <= self.b) - cond = cond0 & cond1 - output = zeros(shape(cond),'d') - putmask(output,(1-cond0)+np.isnan(x),self.badvalue) - if any(cond): - goodargs = argsreduce(cond, *((x,)+args+(scale,))) - scale, goodargs = goodargs[-1], goodargs[:-1] - place(output,cond,self._pdf(*goodargs) / scale) - if output.ndim == 0: - return output[()] - return output - - def logpdf(self, x, *args, **kwds): - """ - Log of the probability density function at x of the given RV. - - This uses a more numerically accurate calculation if available. - - Parameters - ---------- - x : array_like - quantiles - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - logpdf : array_like - Log of the probability density function evaluated at x - - """ - args, loc, scale = self._parse_args(*args, **kwds) - x,loc,scale = map(asarray,(x,loc,scale)) - args = tuple(map(asarray,args)) - x = asarray((x-loc)*1.0/scale) - cond0 = self._argcheck(*args) & (scale > 0) - cond1 = (scale > 0) & (x >= self.a) & (x <= self.b) - cond = cond0 & cond1 - output = empty(shape(cond),'d') - output.fill(NINF) - putmask(output,(1-cond0)+np.isnan(x),self.badvalue) - if any(cond): - goodargs = argsreduce(cond, *((x,)+args+(scale,))) - scale, goodargs = goodargs[-1], goodargs[:-1] - place(output,cond,self._logpdf(*goodargs) - log(scale)) - if output.ndim == 0: - return output[()] - return output - - def cdf(self,x,*args,**kwds): - """ - Cumulative distribution function of the given RV. - - Parameters - ---------- - x : array_like - quantiles - arg1, arg2, arg3,... 
: array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - cdf : ndarray - Cumulative distribution function evaluated at `x` - - """ - args, loc, scale = self._parse_args(*args, **kwds) - x,loc,scale = map(asarray,(x,loc,scale)) - args = tuple(map(asarray,args)) - x = (x-loc)*1.0/scale - cond0 = self._argcheck(*args) & (scale > 0) - cond1 = (scale > 0) & (x > self.a) & (x < self.b) - cond2 = (x >= self.b) & cond0 - cond = cond0 & cond1 - output = zeros(shape(cond),'d') - place(output,(1-cond0)+np.isnan(x),self.badvalue) - place(output,cond2,1.0) - if any(cond): # call only if at least 1 entry - goodargs = argsreduce(cond, *((x,)+args)) - place(output,cond,self._cdf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def logcdf(self,x,*args,**kwds): - """ - Log of the cumulative distribution function at x of the given RV. - - Parameters - ---------- - x : array_like - quantiles - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - logcdf : array_like - Log of the cumulative distribution function evaluated at x - - """ - args, loc, scale = self._parse_args(*args, **kwds) - x,loc,scale = map(asarray,(x,loc,scale)) - args = tuple(map(asarray,args)) - x = (x-loc)*1.0/scale - cond0 = self._argcheck(*args) & (scale > 0) - cond1 = (scale > 0) & (x > self.a) & (x < self.b) - cond2 = (x >= self.b) & cond0 - cond = cond0 & cond1 - output = empty(shape(cond),'d') - output.fill(NINF) - place(output,(1-cond0)*(cond1 == cond1)+np.isnan(x),self.badvalue) - place(output,cond2,0.0) - if any(cond): # call only if at least 1 entry - goodargs = argsreduce(cond, *((x,)+args)) - place(output,cond,self._logcdf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def sf(self,x,*args,**kwds): - """ - Survival function (1-cdf) at x of the given RV. - - Parameters - ---------- - x : array_like - quantiles - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - sf : array_like - Survival function evaluated at x - - """ - args, loc, scale = self._parse_args(*args, **kwds) - x,loc,scale = map(asarray,(x,loc,scale)) - args = tuple(map(asarray,args)) - x = (x-loc)*1.0/scale - cond0 = self._argcheck(*args) & (scale > 0) - cond1 = (scale > 0) & (x > self.a) & (x < self.b) - cond2 = cond0 & (x <= self.a) - cond = cond0 & cond1 - output = zeros(shape(cond),'d') - place(output,(1-cond0)+np.isnan(x),self.badvalue) - place(output,cond2,1.0) - if any(cond): - goodargs = argsreduce(cond, *((x,)+args)) - place(output,cond,self._sf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def logsf(self,x,*args,**kwds): - """ - Log of the survival function of the given RV. - - Returns the log of the "survival function," defined as (1 - `cdf`), - evaluated at `x`. - - Parameters - ---------- - x : array_like - quantiles - arg1, arg2, arg3,... 
: array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - logsf : ndarray - Log of the survival function evaluated at `x`. - - """ - args, loc, scale = self._parse_args(*args, **kwds) - x,loc,scale = map(asarray,(x,loc,scale)) - args = tuple(map(asarray,args)) - x = (x-loc)*1.0/scale - cond0 = self._argcheck(*args) & (scale > 0) - cond1 = (scale > 0) & (x > self.a) & (x < self.b) - cond2 = cond0 & (x <= self.a) - cond = cond0 & cond1 - output = empty(shape(cond),'d') - output.fill(NINF) - place(output,(1-cond0)+np.isnan(x),self.badvalue) - place(output,cond2,0.0) - if any(cond): - goodargs = argsreduce(cond, *((x,)+args)) - place(output,cond,self._logsf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def ppf(self,q,*args,**kwds): - """ - Percent point function (inverse of cdf) at q of the given RV. - - Parameters - ---------- - q : array_like - lower tail probability - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - x : array_like - quantile corresponding to the lower tail probability q. - - """ - args, loc, scale = self._parse_args(*args, **kwds) - q, loc, scale = map(asarray,(q, loc, scale)) - args = tuple(map(asarray, args)) - cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc) - cond1 = (0 < q) & (q < 1) - cond2 = cond0 & (q == 0) - cond3 = cond0 & (q == 1) - cond = cond0 & cond1 - output = valarray(shape(cond), value=self.badvalue) - - lower_bound = self.a * scale + loc - upper_bound = self.b * scale + loc - place(output, cond2, argsreduce(cond2, lower_bound)[0]) - place(output, cond3, argsreduce(cond3, upper_bound)[0]) - - if any(cond): # call only if at least 1 entry - goodargs = argsreduce(cond, *((q,)+args+(scale,loc))) - scale, loc, goodargs = goodargs[-2], goodargs[-1], goodargs[:-2] - place(output, cond, self._ppf(*goodargs) * scale + loc) - if output.ndim == 0: - return output[()] - return output - - def isf(self, q, *args, **kwds): - """ - Inverse survival function at q of the given RV. - - Parameters - ---------- - q : array_like - upper tail probability - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - - Returns - ------- - x : ndarray or scalar - Quantile corresponding to the upper tail probability q. 
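For ``q`` outside the open interval (0, 1) these inverses return the scaled and shifted support endpoints rather than attempting a root search, and by default ``isf(q)`` is simply ``ppf(1 - q)``. A quick check with an exponential distribution, shown here with ``scipy.stats`` (whose behaviour this module tracks)::

    import numpy as np
    from scipy.stats import expon

    q = np.array([0.0, 0.1, 0.5, 1.0])
    print(expon.ppf(q, loc=2.0, scale=3.0))
    # [2.     2.316...  4.079...   inf]  -> q=0 maps to a*scale + loc, q=1 to b
    print(np.allclose(expon.isf(0.1, loc=2.0, scale=3.0),
                      expon.ppf(0.9, loc=2.0, scale=3.0)))   # True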
- - """ - args, loc, scale = self._parse_args(*args, **kwds) - q, loc, scale = map(asarray, (q, loc, scale)) - args = tuple(map(asarray, args)) - cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc) - cond1 = (0 < q) & (q < 1) - cond2 = cond0 & (q == 1) - cond3 = cond0 & (q == 0) - cond = cond0 & cond1 - output = valarray(shape(cond), value=self.badvalue) - - lower_bound = self.a * scale + loc - upper_bound = self.b * scale + loc - place(output, cond2, argsreduce(cond2, lower_bound)[0]) - place(output, cond3, argsreduce(cond3, upper_bound)[0]) - - if any(cond): - goodargs = argsreduce(cond, *((q,)+args+(scale,loc))) - scale, loc, goodargs = goodargs[-2], goodargs[-1], goodargs[:-2] - place(output, cond, self._isf(*goodargs) * scale + loc) - if output.ndim == 0: - return output[()] - return output - - def stats(self,*args,**kwds): - """ - Some statistics of the given RV - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - location parameter (default=0) - scale : array_like, optional - scale parameter (default=1) - moments : str, optional - composed of letters ['mvsk'] defining which moments to compute: - 'm' = mean, - 'v' = variance, - 's' = (Fisher's) skew, - 'k' = (Fisher's) kurtosis. - (default='mv') - - Returns - ------- - stats : sequence - of requested moments. - - """ - args, loc, scale, moments = self._parse_args_stats(*args, **kwds) - loc, scale = map(asarray, (loc, scale)) - args = tuple(map(asarray, args)) - cond = self._argcheck(*args) & (scale > 0) & (loc == loc) - - signature = inspect.getargspec(get_method_function(self._stats)) - if (signature[2] is not None) or ('moments' in signature[0]): - mu, mu2, g1, g2 = self._stats(*args,**{'moments':moments}) - else: - mu, mu2, g1, g2 = self._stats(*args) - if g1 is None: - mu3 = None - else: - mu3 = g1*np.power(mu2,1.5) # (mu2**1.5) breaks down for nan and inf - default = valarray(shape(cond), self.badvalue) - output = [] - - # Use only entries that are valid in calculation - if any(cond): - goodargs = argsreduce(cond, *(args+(scale,loc))) - scale, loc, goodargs = goodargs[-2], goodargs[-1], goodargs[:-2] - if 'm' in moments: - if mu is None: - mu = self._munp(1.0,*goodargs) - out0 = default.copy() - place(out0,cond,mu*scale+loc) - output.append(out0) - - if 'v' in moments: - if mu2 is None: - mu2p = self._munp(2.0,*goodargs) - if mu is None: - mu = self._munp(1.0,*goodargs) - mu2 = mu2p - mu*mu - if np.isinf(mu): - #if mean is inf then var is also inf - mu2 = np.inf - out0 = default.copy() - place(out0,cond,mu2*scale*scale) - output.append(out0) - - if 's' in moments: - if g1 is None: - mu3p = self._munp(3.0,*goodargs) - if mu is None: - mu = self._munp(1.0,*goodargs) - if mu2 is None: - mu2p = self._munp(2.0,*goodargs) - mu2 = mu2p - mu*mu - mu3 = mu3p - 3*mu*mu2 - mu**3 - g1 = mu3 / np.power(mu2, 1.5) - out0 = default.copy() - place(out0,cond,g1) - output.append(out0) - - if 'k' in moments: - if g2 is None: - mu4p = self._munp(4.0,*goodargs) - if mu is None: - mu = self._munp(1.0,*goodargs) - if mu2 is None: - mu2p = self._munp(2.0,*goodargs) - mu2 = mu2p - mu*mu - if mu3 is None: - mu3p = self._munp(3.0,*goodargs) - mu3 = mu3p - 3*mu*mu2 - mu**3 - mu4 = mu4p - 4*mu*mu3 - 6*mu*mu*mu2 - mu**4 - g2 = mu4 / mu2**2.0 - 3.0 - out0 = default.copy() - place(out0,cond,g2) - output.append(out0) - else: # no valid args - output = [] - for _ in moments: - out0 = default.copy() - 
output.append(out0) - - if len(output) == 1: - return output[0] - else: - return tuple(output) - - def moment(self, n, *args, **kwds): - """ - n'th order non-central moment of distribution. - - Parameters - ---------- - n : int, n>=1 - Order of moment. - arg1, arg2, arg3,... : float - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - kwds : keyword arguments, optional - These can include "loc" and "scale", as well as other keyword - arguments relevant for a given distribution. - - """ - args, loc, scale = self._parse_args(*args, **kwds) -# loc = kwds.get('loc', None) -# scale = kwds.get('scale', None) -# args, loc, scale = self.fix_loc_scale(args, loc, scale) - if not (self._argcheck(*args) and (scale > 0)): - return nan - if (floor(n) != n): - raise ValueError("Moment must be an integer.") - if (n < 0): - raise ValueError("Moment must be positive.") - mu, mu2, g1, g2 = None, None, None, None - if (n > 0) and (n < 5): - signature = inspect.getargspec(get_method_function(self._stats)) - if (signature[2] is not None) or ('moments' in signature[0]): - mdict = {'moments':{1:'m',2:'v',3:'vs',4:'vk'}[n]} - else: - mdict = {} - mu, mu2, g1, g2 = self._stats(*args,**mdict) - val = _moment_from_stats(n, mu, mu2, g1, g2, self._munp, args) - - # Convert to transformed X = L + S*Y - # so E[X^n] = E[(L+S*Y)^n] = L^n sum(comb(n,k)*(S/L)^k E[Y^k],k=0...n) - if loc == 0: - return scale**n * val - else: - result = 0 - fac = float(scale) / float(loc) - for k in range(n): - valk = _moment_from_stats(k, mu, mu2, g1, g2, self._munp, args) - result += comb(n,k,exact=True)*(fac**k) * valk - result += fac**n * val - return result * loc**n - - def link(self, x, logSF, theta, i): - ''' Return dist. par. no. i as function of quantile (x) and log survival probability (sf) - where - theta is the list containing all parameters including location and scale. - ''' - raise ValueError('Link function not implemented for the %s distribution' % self.name) - return None - - def _nnlf(self, x, *args): - return -sum(self._logpdf(x, *args),axis=0) - - def nnlf(self, theta, x): - '''Return negative loglikelihood function - - Notes - ----- - This is ``-sum(log pdf(x, theta), axis=0)`` where theta are the - parameters (including loc and scale). 
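Because ``theta`` packs the shape parameters first and ``(loc, scale)`` last, the negative log-likelihood of the standardized data reduces to ``-sum(logpdf((x - loc)/scale, *shapes)) + N*log(scale)``. A standalone sketch of that computation (the out-of-support penalty applied above is omitted, and the normal log-pdf is only a stand-in)::

    import numpy as np

    def nnlf(theta, x, logpdf):
        """Negative log-likelihood for theta = shapes + (loc, scale)."""
        *shapes, loc, scale = theta
        if scale <= 0:
            return np.inf
        y = (np.asarray(x, dtype=float) - loc) / scale
        return -np.sum(logpdf(y, *shapes)) + y.size * np.log(scale)

    def norm_logpdf(y):
        return -0.5 * y ** 2 - 0.5 * np.log(2 * np.pi)

    x = np.array([0.2, -0.5, 1.3, 0.7])
    print(nnlf((0.0, 1.0), x, norm_logpdf))   # NLL of a standard normal fit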
- ''' - try: - loc = theta[-2] - scale = theta[-1] - args = tuple(theta[:-2]) - except IndexError: - raise ValueError("Not enough input arguments.") - if not self._argcheck(*args) or scale <= 0: - return inf - x = asarray((x-loc) / scale) - cond0 = (x <= self.a) | (self.b <= x) - Nbad = sum(cond0) - loginf = log(floatinfo.machar.xmax) - if Nbad>0: - x = argsreduce(~cond0, x)[0] - - N = len(x) - return self._nnlf(x, *args) + N*log(scale) + Nbad * 100.0 * loginf - - def _penalized_nnlf(self, theta, x): - ''' Return negative loglikelihood function, - i.e., - sum (log pdf(x, theta),axis=0) - where theta are the parameters (including loc and scale) - ''' - try: - loc = theta[-2] - scale = theta[-1] - args = tuple(theta[:-2]) - except IndexError: - raise ValueError("Not enough input arguments.") - if not self._argcheck(*args) or scale <= 0: - return inf - x = asarray((x-loc) / scale) - - loginf = log(floatinfo.machar.xmax) - - if np.isneginf(self.a).all() and np.isinf(self.b).all(): - Nbad = 0 - else: - cond0 = (x <= self.a) | (self.b <= x) - Nbad = sum(cond0) - if Nbad > 0: - x = argsreduce(~cond0, x)[0] - - N = len(x) - return self._nnlf(x, *args) + N*log(scale) + Nbad * 100.0 * loginf - - def hessian_nnlf(self, theta, data, eps=None): - ''' approximate hessian of nnlf where theta are the parameters (including loc and scale) - ''' - #Nd = len(x) - np = len(theta) - # pab 07.01.2001: Always choose the stepsize h so that - # it is an exactly representable number. - # This is important when calculating numerical derivatives and is - # accomplished by the following. - - if eps == None: - eps = (floatinfo.machar.eps) ** 0.4 - #xmin = floatinfo.machar.xmin - #myfun = lambda y: max(y,100.0*log(xmin)) #% trick to avoid log of zero - delta = (eps + 2.0) - 2.0 - delta2 = delta ** 2.0 - # Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with - # 1/(d^2 L(theta|x)/dtheta^2) - # using central differences - - LL = self.nnlf(theta, data) - H = zeros((np, np)) #%% Hessian matrix - theta = tuple(theta) - for ix in xrange(np): - sparam = list(theta) - sparam[ix] = theta[ix] + delta - fp = self.nnlf(sparam, data) - #fp = sum(myfun(x)) - - sparam[ix] = theta[ix] - delta - fm = self.nnlf(sparam, data) - #fm = sum(myfun(x)) - - H[ix, ix] = (fp - 2 * LL + fm) / delta2 - for iy in range(ix + 1, np): - sparam[ix] = theta[ix] + delta - sparam[iy] = theta[iy] + delta - fpp = self.nnlf(sparam, data) - #fpp = sum(myfun(x)) - - sparam[iy] = theta[iy] - delta - fpm = self.nnlf(sparam, data) - #fpm = sum(myfun(x)) - - sparam[ix] = theta[ix] - delta - fmm = self.nnlf(sparam, data) - #fmm = sum(myfun(x)); - - sparam[iy] = theta[iy] + delta - fmp = self.nnlf(sparam, data) - #fmp = sum(myfun(x)) - H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2) - H[iy, ix] = H[ix, iy] - sparam[iy] = theta[iy]; - - # invert the Hessian matrix (i.e. invert the observed information number) - #pcov = -pinv(H); - return - H - - def nlogps(self, theta, x): - """ Moran's negative log Product Spacings statistic - - where theta are the parameters (including loc and scale) - - Note the data in x must be sorted - - References - ----------- - - R. C. H. Cheng; N. A. K. Amin (1983) - "Estimating Parameters in Continuous Univariate Distributions with a - Shifted Origin.", - Journal of the Royal Statistical Society. Series B (Methodological), - Vol. 45, No. 3. (1983), pp. 394-403. - - R. C. H. Cheng; M. A. Stephens (1989) - "A Goodness-Of-Fit Test Using Moran's Statistic with Estimated - Parameters", Biometrika, 76, 2, pp 385-392 - - Wong, T.S.T. 
and Li, W.K. (2006) - "A note on the estimation of extreme value distributions using maximum - product of spacings.", - IMS Lecture Notes Monograph Series 2006, Vol. 52, pp. 272-283 - """ - - try: - loc = theta[-2] - scale = theta[-1] - args = tuple(theta[:-2]) - except IndexError: - raise ValueError, "Not enough input arguments." - if not self._argcheck(*args) or scale <= 0: - return inf - x = asarray((x - loc) / scale) - cond0 = (x <= self.a) | (self.b <= x) - Nbad = sum(cond0) - if Nbad>0: - x = argsreduce( ~cond0, x)[0] - - - lowertail = True - if lowertail: - prb = numpy.hstack((0.0, self.cdf(x, *args), 1.0)) - dprb = numpy.diff(prb) - else: - prb = numpy.hstack((1.0, self.sf(x, *args), 0.0)) - dprb = -numpy.diff(prb) - - logD = log(dprb) - dx = numpy.diff(x, axis=0) - tie = (dx == 0) - if any(tie): - # TODO : implement this method for treating ties in data: - # Assume measuring error is delta. Then compute - # yL = F(xi-delta,theta) - # yU = F(xi+delta,theta) - # and replace - # logDj = log((yU-yL)/(r-1)) for j = i+1,i+2,...i+r-1 - - # The following is OK when only minimization of T is wanted - i_tie = nonzero(tie) - tiedata = x[i_tie] - logD[i_tie + 1] = log(self._pdf(tiedata, *args)) - log(scale) - - finiteD = numpy.isfinite(logD) - nonfiniteD = 1 - finiteD - Nbad += sum(nonfiniteD, axis=0) - if Nbad>0: - realmax = floatinfo.machar.xmax - T = -sum(logD[finiteD], axis=0) + 100.0 * log(realmax) * Nbad; - else: - T = -sum(logD, axis=0) #%Moran's negative log product spacing statistic - return T - - def hessian_nlogps(self, theta, data, eps=None): - ''' approximate hessian of nlogps where theta are the parameters (including loc and scale) - ''' - np = len(theta) - # pab 07.01.2001: Always choose the stepsize h so that - # it is an exactly representable number. - # This is important when calculating numerical derivatives and is - # accomplished by the following. - - if eps == None: - eps = (floatinfo.machar.eps) ** 0.4 - #xmin = floatinfo.machar.xmin - #myfun = lambda y: max(y,100.0*log(xmin)) #% trick to avoid log of zero - delta = (eps + 2.0) - 2.0 - delta2 = delta ** 2.0 - # Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with - # 1/(d^2 L(theta|x)/dtheta^2) - # using central differences - - LL = self.nlogps(theta, data) - H = zeros((np, np)) #%% Hessian matrix - theta = tuple(theta) - for ix in xrange(np): - sparam = list(theta) - sparam[ix] = theta[ix] + delta - fp = self.nlogps(sparam, data) - #fp = sum(myfun(x)) - - sparam[ix] = theta[ix] - delta - fm = self.nlogps(sparam, data) - #fm = sum(myfun(x)) - - H[ix, ix] = (fp - 2 * LL + fm) / delta2 - for iy in range(ix + 1, np): - sparam[ix] = theta[ix] + delta - sparam[iy] = theta[iy] + delta - fpp = self.nlogps(sparam, data) - #fpp = sum(myfun(x)) - - sparam[iy] = theta[iy] - delta - fpm = self.nlogps(sparam, data) - #fpm = sum(myfun(x)) - - sparam[ix] = theta[ix] - delta - fmm = self.nlogps(sparam, data) - #fmm = sum(myfun(x)); - - sparam[iy] = theta[iy] + delta - fmp = self.nlogps(sparam, data) - #fmp = sum(myfun(x)) - H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2) - H[iy, ix] = H[ix, iy] - sparam[iy] = theta[iy]; - - # invert the Hessian matrix (i.e. 
invert the observed information number) - #pcov = -pinv(H); - return - H - - # return starting point for fit (shape arguments + loc + scale) - def _fitstart(self, data, args=None): - if args is None: - args = (1.0,)*self.numargs - return args + self.fit_loc_scale(data, *args) - - # Return the (possibly reduced) function to optimize in order to find MLE - # estimates for the .fit method - def _reduce_func(self, args, kwds): - args = list(args) - Nargs = len(args) - fixedn = [] - index = list(range(Nargs)) - names = ['f%d' % n for n in range(Nargs - 2)] + ['floc', 'fscale'] - x0 = [] - for n, key in zip(index, names): - if key in kwds: - fixedn.append(n) - args[n] = kwds[key] - else: - x0.append(args[n]) - method = kwds.get('method', 'ml').lower() - if method.startswith('mps'): - fitfun = self.nlogps - else: - fitfun = self._penalized_nnlf - - if len(fixedn) == 0: - func = fitfun - restore = None - else: - if len(fixedn) == len(index): - raise ValueError("All parameters fixed. There is nothing to optimize.") - - def restore(args, theta): - # Replace with theta for all numbers not in fixedn - # This allows the non-fixed values to vary, but - # we still call self.nnlf with all parameters. - i = 0 - for n in range(Nargs): - if n not in fixedn: - args[n] = theta[i] - i += 1 - return args - - def func(theta, x): - newtheta = restore(args[:], theta) - return fitfun(newtheta, x) - - return x0, func, restore, args - - def fit(self, data, *args, **kwds): - """ - Return MLEs for shape, location, and scale parameters from data. - - MLE stands for Maximum Likelihood Estimate. Starting estimates for - the fit are given by input arguments; for any arguments not provided - with starting estimates, ``self._fitstart(data)`` is called to generate - such. - - One can hold some parameters fixed to specific values by passing in - keyword arguments ``f0``, ``f1``, ..., ``fn`` (for shape parameters) - and ``floc`` and ``fscale`` (for location and scale parameters, - respectively). - - Parameters - ---------- - data : array_like - Data to use in calculating the MLEs. - args : floats, optional - Starting value(s) for any shape-characterizing arguments (those not - provided will be determined by a call to ``_fitstart(data)``). - No default value. - kwds : floats, optional - Starting values for the location and scale parameters; no default. - Special keyword arguments are recognized as holding certain - parameters fixed: - - f0...fn : hold respective shape parameters fixed. - - floc : hold location parameter fixed to specified value. - - fscale : hold scale parameter fixed to specified value. - - optimizer : The optimizer to use. The optimizer must take func, - and starting position as the first two arguments, - plus args (for extra arguments to pass to the - function to be optimized) and disp=0 to suppress - output as keyword arguments. - - Returns - ------- - shape, loc, scale : tuple of floats - MLEs for any shape statistics, followed by those for location and - scale. - - Notes - ----- - This fit is computed by maximizing a log-likelihood function, with - penalty applied for samples outside of range of the distribution. The - returned answer is not guaranteed to be the globally optimal MLE, it - may only be locally optimal, or the optimization may fail altogether. 
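A minimal illustration of the keyword-fixing behaviour described above, shown with ``scipy.stats.lognorm``, which exposes the same ``fit`` interface::

    import numpy as np
    from scipy.stats import lognorm

    data = lognorm.rvs(0.8, loc=0.0, scale=3.0, size=1000, random_state=0)
    # unconstrained fit: shape, loc and scale are all estimated
    s, loc, scale = lognorm.fit(data)
    # constrained fit: hold the location parameter fixed at zero
    s0, loc0, scale0 = lognorm.fit(data, floc=0)
    assert loc0 == 0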
- """ - Narg = len(args) - if Narg > self.numargs: - raise TypeError("Too many input arguments.") - - start = [None]*2 - if (Narg < self.numargs) or not ('loc' in kwds and - 'scale' in kwds): - start = self._fitstart(data) # get distribution specific starting locations - args += start[Narg:-2] - loc = kwds.get('loc', start[-2]) - scale = kwds.get('scale', start[-1]) - args += (loc, scale) - x0, func, restore, args = self._reduce_func(args, kwds) - - optimizer = kwds.get('optimizer', optimize.fmin) - # convert string to function in scipy.optimize - if not callable(optimizer) and isinstance(optimizer, string_types): - if not optimizer.startswith('fmin_'): - optimizer = "fmin_"+optimizer - if optimizer == 'fmin_': - optimizer = 'fmin' - try: - optimizer = getattr(optimize, optimizer) - except AttributeError: - raise ValueError("%s is not a valid optimizer" % optimizer) - vals = optimizer(func,x0,args=(ravel(data),),disp=0) - if restore is not None: - vals = restore(args, vals) - vals = tuple(vals) - return vals - - def fit2(self, data, *args, **kwds): - ''' Return Maximum Likelihood or Maximum Product Spacing estimator object - - Parameters - ---------- - data : array-like - Data to use in calculating the ML or MPS estimators - args : optional - Starting values for any shape arguments (those not specified - will be determined by dist._fitstart(data)) - kwds : loc, scale - Starting values for the location and scale parameters - Special keyword arguments are recognized as holding certain - parameters fixed: - f0..fn : hold respective shape paramters fixed - floc : hold location parameter fixed to specified value - fscale : hold scale parameter fixed to specified value - method : of estimation. Options are - 'ml' : Maximum Likelihood method (default) - 'mps': Maximum Product Spacing method - alpha : scalar, optional - Confidence coefficent (default=0.05) - search : bool - If true search for best estimator (default), - otherwise return object with initial distribution parameters - copydata : bool - If true copydata (default) - optimizer : The optimizer to use. The optimizer must take func, - and starting position as the first two arguments, - plus args (for extra arguments to pass to the - function to be optimized) and disp=0 to suppress - output as keyword arguments. - - Return - ------ - phat : FitDistribution object - Fitted distribution object with following member variables: - LLmax : loglikelihood function evaluated using par - LPSmax : log product spacing function evaluated using par - pvalue : p-value for the fit - par : distribution parameters (fixed and fitted) - par_cov : covariance of distribution parameters - par_fix : fixed distribution parameters - par_lower : lower (1-alpha)% confidence bound for the parameters - par_upper : upper (1-alpha)% confidence bound for the parameters - - Note - ---- - `data` is sorted using this function, so if `copydata`==False the data - in your namespace will be sorted as well. - ''' - return FitDistribution(self, data, *args, **kwds) - - def fit_loc_scale(self, data, *args): - """ - Estimate loc and scale parameters from data using 1st and 2nd moments. - - Parameters - ---------- - data : array_like - Data to fit. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - - Returns - ------- - Lhat : float - Estimated location parameter for the data. - Shat : float - Estimated scale parameter for the data. 
- - """ - mu, mu2 = self.stats(*args,**{'moments':'mv'}) - tmp = asarray(data) - muhat = tmp.mean() - mu2hat = tmp.var() - Shat = sqrt(mu2hat / mu2) - Lhat = muhat - Shat*mu - if not np.isfinite(Lhat): - Lhat = 0 - if not (np.isfinite(Shat) and (0 < Shat)): - Shat = 1 - return Lhat, Shat - - @np.deprecate - def est_loc_scale(self, data, *args): - """This function is deprecated, use self.fit_loc_scale(data) instead.""" - return self.fit_loc_scale(data, *args) - - def freeze(self,*args,**kwds): - """Freeze the distribution for the given arguments. - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution. Should include all - the non-optional arguments, may include ``loc`` and ``scale``. - - Returns - ------- - rv_frozen : rv_frozen instance - The frozen distribution. - - """ - return rv_frozen(self,*args,**kwds) - - def __call__(self, *args, **kwds): - return self.freeze(*args, **kwds) - - def _entropy(self, *args): - def integ(x): - val = self._pdf(x, *args) - return special.xlogy(val, val) - - entr = -integrate.quad(integ,self.a,self.b)[0] - if not np.isnan(entr): - return entr - else: # try with different limits if integration problems - low,upp = self.ppf([0.001,0.999],*args) - if np.isinf(self.b): - upper = upp - else: - upper = self.b - if np.isinf(self.a): - lower = low - else: - lower = self.a - return -integrate.quad(integ,lower,upper)[0] - - def entropy(self, *args, **kwds): - """ - Differential entropy of the RV. - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - scale : array_like, optional - Scale parameter (default=1). - - """ - args, loc, scale = self._parse_args(*args, **kwds) - args = tuple(map(asarray,args)) - cond0 = self._argcheck(*args) & (scale > 0) & (loc == loc) - output = zeros(shape(cond0),'d') - place(output,(1-cond0),self.badvalue) - goodargs = argsreduce(cond0, *args) - # np.vectorize doesn't work when numargs == 0 in numpy 1.5.1 - if self.numargs == 0: - place(output,cond0,self._entropy()+log(scale)) - else: - place(output,cond0,self.vecentropy(*goodargs)+log(scale)) - - return output - - def expect(self, func=None, args=(), loc=0, scale=1, lb=None, ub=None, - conditional=False, **kwds): - """Calculate expected value of a function with respect to the distribution - - The expected value of a function ``f(x)`` with respect to a - distribution ``dist`` is defined as:: - - ubound - E[x] = Integral(f(x) * dist.pdf(x)) - lbound - - Parameters - ---------- - func : callable, optional - Function for which integral is calculated. Takes only one argument. - The default is the identity mapping f(x) = x. - args : tuple, optional - Argument (parameters) of the distribution. - lb, ub : scalar, optional - Lower and upper bound for integration. default is set to the support - of the distribution. - conditional : bool, optional - If True, the integral is corrected by the conditional probability - of the integration interval. The return value is the expectation - of the function, conditional on being in the given interval. - Default is False. - - Additional keyword arguments are passed to the integration routine. - - Returns - ------- - expect : float - The calculated expected value. - - Notes - ----- - The integration behavior of this function is inherited from - `integrate.quad`. 
- - """ - lockwds = {'loc': loc, - 'scale':scale} - self._argcheck(*args) - if func is None: - def fun(x, *args): - return x * self.pdf(x, *args, **lockwds) - else: - def fun(x, *args): - return func(x) * self.pdf(x, *args, **lockwds) - if lb is None: - lb = loc + self.a * scale - if ub is None: - ub = loc + self.b * scale - if conditional: - invfac = (self.sf(lb, *args, **lockwds) - - self.sf(ub, *args, **lockwds)) - else: - invfac = 1.0 - kwds['args'] = args - return integrate.quad(fun, lb, ub, **kwds)[0] / invfac - - -_EULER = 0.577215664901532860606512090082402431042 # -special.psi(1) -_ZETA3 = 1.202056903159594285399738161511449990765 # special.zeta(3,1) Apery's constant - - -## Kolmogorov-Smirnov one-sided and two-sided test statistics -class ksone_gen(rv_continuous): - """General Kolmogorov-Smirnov one-sided test. - - %(default)s - - """ - def _cdf(self, x, n): - return 1.0 - special.smirnov(n, x) - - def _ppf(self, q, n): - return special.smirnovi(n, 1.0 - q) -ksone = ksone_gen(a=0.0, name='ksone') - - -class kstwobign_gen(rv_continuous): - """Kolmogorov-Smirnov two-sided test for large N. - - %(default)s - - """ - def _cdf(self,x): - return 1.0-special.kolmogorov(x) - - def _sf(self,x): - return special.kolmogorov(x) - - def _ppf(self,q): - return special.kolmogi(1.0-q) -kstwobign = kstwobign_gen(a=0.0, name='kstwobign') - - -## Normal distribution - -# loc = mu, scale = std -# Keep these implementations out of the class definition so they can be reused -# by other distributions. -_norm_pdf_C = math.sqrt(2*pi) -_norm_pdf_logC = math.log(_norm_pdf_C) - - -def _norm_pdf(x): - return exp(-x**2/2.0) / _norm_pdf_C - - -def _norm_logpdf(x): - return -x**2 / 2.0 - _norm_pdf_logC - - -def _norm_cdf(x): - return special.ndtr(x) - - -def _norm_logcdf(x): - return special.log_ndtr(x) - - -def _norm_ppf(q): - return special.ndtri(q) - - -def _norm_sf(x): - return special.ndtr(-x) - - -def _norm_logsf(x): - return special.log_ndtr(-x) - - -def _norm_isf(q): - return -special.ndtri(q) - - -class norm_gen(rv_continuous): - """A normal continuous random variable. - - The location (loc) keyword specifies the mean. - The scale (scale) keyword specifies the standard deviation. - - %(before_notes)s - - Notes - ----- - The probability density function for `norm` is:: - - norm.pdf(x) = exp(-x**2/2)/sqrt(2*pi) - - %(example)s - - """ - def _rvs(self): - return mtrand.standard_normal(self._size) - - def _pdf(self,x): - return _norm_pdf(x) - - def _logpdf(self, x): - return _norm_logpdf(x) - - def _cdf(self,x): - return _norm_cdf(x) - - def _logcdf(self, x): - return _norm_logcdf(x) - - def _sf(self, x): - return _norm_sf(x) - - def _logsf(self, x): - return _norm_logsf(x) - - def _ppf(self,q): - return _norm_ppf(q) - - def _isf(self,q): - return _norm_isf(q) - - def _stats(self): - return 0.0, 1.0, 0.0, 0.0 - - def _entropy(self): - return 0.5*(log(2*pi)+1) - - @inherit_docstring_from(rv_continuous) - def fit(self, data, **kwds): - """%(super)s - This function (norm_gen.fit) uses explicit formulas for the maximum - likelihood estimation of the parameters, so the `optimizer` argument - is ignored. - """ - floc = kwds.get('floc', None) - fscale = kwds.get('fscale', None) - - if floc is not None and fscale is not None: - # This check is for consistency with `rv_continuous.fit`. - # Without this check, this function would just return the - # parameters that were given. - raise ValueError("All parameters fixed. 
There is nothing to " - "optimize.") - - data = np.asarray(data) - - if floc is None: - loc = data.mean() - else: - loc = floc - - if fscale is None: - scale = np.sqrt(((data - loc)**2).mean()) - else: - scale = fscale - - return loc, scale - -norm = norm_gen(name='norm') - - -class alpha_gen(rv_continuous): - """An alpha continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `alpha` is:: - - alpha.pdf(x,a) = 1/(x**2*Phi(a)*sqrt(2*pi)) * exp(-1/2 * (a-1/x)**2), - - where ``Phi(alpha)`` is the normal CDF, ``x > 0``, and ``a > 0``. - - %(example)s - - """ - def _pdf(self, x, a): - return 1.0/(x**2)/special.ndtr(a)*_norm_pdf(a-1.0/x) - - def _logpdf(self, x, a): - return -2*log(x) + _norm_logpdf(a-1.0/x) - log(special.ndtr(a)) - - def _cdf(self, x, a): - return special.ndtr(a-1.0/x) / special.ndtr(a) - - def _ppf(self, q, a): - return 1.0/asarray(a-special.ndtri(q*special.ndtr(a))) - - def _stats(self, a): - return [inf]*2 + [nan]*2 -alpha = alpha_gen(a=0.0, name='alpha') - - -class anglit_gen(rv_continuous): - """An anglit continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `anglit` is:: - - anglit.pdf(x) = sin(2*x + pi/2) = cos(2*x), - - for ``-pi/4 <= x <= pi/4``. - - %(example)s - - """ - def _pdf(self, x): - return cos(2*x) - - def _cdf(self, x): - return sin(x+pi/4)**2.0 - - def _ppf(self, q): - return (arcsin(sqrt(q))-pi/4) - - def _stats(self): - return 0.0, pi*pi/16-0.5, 0.0, -2*(pi**4 - 96)/(pi*pi-8)**2 - - def _entropy(self): - return 1-log(2) -anglit = anglit_gen(a=-pi/4, b=pi/4, name='anglit') - - -class arcsine_gen(rv_continuous): - """An arcsine continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `arcsine` is:: - - arcsine.pdf(x) = 1/(pi*sqrt(x*(1-x))) - for 0 < x < 1. - - %(example)s - - """ - def _pdf(self, x): - return 1.0/pi/sqrt(x*(1-x)) - - def _cdf(self, x): - return 2.0/pi*arcsin(sqrt(x)) - - def _ppf(self, q): - return sin(pi/2.0*q)**2.0 - - def _stats(self): - mu = 0.5 - mu2 = 1.0/8 - g1 = 0 - g2 = -3.0/2.0 - return mu, mu2, g1, g2 - - def _entropy(self): - return -0.24156447527049044468 -arcsine = arcsine_gen(a=0.0, b=1.0, name='arcsine') - - -class FitDataError(ValueError): - # This exception is raised by, for example, beta_gen.fit when both floc - # and fscale are fixed and there are values in the data not in the open - # interval (floc, floc+fscale). - def __init__(self, distr, lower, upper): - self.args = ("Invalid values in `data`. Maximum likelihood " - "estimation with {distr!r} requires that {lower!r} < x " - "< {upper!r} for each x in `data`.".format(distr=distr, - lower=lower, upper=upper),) - - -class FitSolverError(RuntimeError): - # This exception is raised by, for example, beta_gen.fit when - # optimize.fsolve returns with ier != 1. - def __init__(self, mesg): - emsg = "Solver for the MLE equations failed to converge: " - emsg += mesg.replace('\n', '') - self.args = (emsg,) - - -def _beta_mle_a(a, b, n, s1): - # The zeros of this function give the MLE for `a`, with - # `b`, `n` and `s1` given. `s1` is the sum of the logs of - # the data. `n` is the number of data points. - psiab = special.psi(a + b) - func = s1 - n * (-psiab + special.psi(a)) - return func - - -def _beta_mle_ab(theta, n, s1, s2): - # Zeros of this function are critical points of - # the maximum likelihood function. Solving this system - # for theta (which contains a and b) gives the MLE for a and b - # given `n`, `s1` and `s2`. 
`s1` is the sum of the logs of the data, - # and `s2` is the sum of the logs of 1 - data. `n` is the number - # of data points. - a, b = theta - psiab = special.psi(a + b) - func = [s1 - n * (-psiab + special.psi(a)), - s2 - n * (-psiab + special.psi(b))] - return func - - -class beta_gen(rv_continuous): - """A beta continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `beta` is:: - - beta.pdf(x, a, b) = gamma(a+b)/(gamma(a)*gamma(b)) * x**(a-1) * - (1-x)**(b-1), - - for ``0 < x < 1``, ``a > 0``, ``b > 0``. - - %(example)s - - """ - def _rvs(self, a, b): - return mtrand.beta(a,b,self._size) - - def _pdf(self, x, a, b): - return np.exp(self._logpdf(x, a, b)) - - def _logpdf(self, x, a, b): - lPx = special.xlog1py(b-1.0, -x) + special.xlogy(a-1.0, x) - lPx -= special.betaln(a,b) - return lPx - - def _cdf(self, x, a, b): - return special.btdtr(a,b,x) - - def _ppf(self, q, a, b): - return special.btdtri(a,b,q) - - def _stats(self, a, b): - mn = a*1.0 / (a + b) - var = (a*b*1.0)/(a+b+1.0)/(a+b)**2.0 - g1 = 2.0*(b-a)*sqrt((1.0+a+b)/(a*b)) / (2+a+b) - g2 = 6.0*(a**3 + a**2*(1-2*b) + b**2*(1+b) - 2*a*b*(2+b)) - g2 /= a*b*(a+b+2)*(a+b+3) - return mn, var, g1, g2 - - def _fitstart(self, data): - g1 = _skew(data) - g2 = _kurtosis(data) - - def func(x): - a, b = x - sk = 2*(b-a)*sqrt(a + b + 1) / (a + b + 2) / sqrt(a*b) - ku = a**3 - a**2*(2*b-1) + b**2*(b+1) - 2*a*b*(b+2) - ku /= a*b*(a+b+2)*(a+b+3) - ku *= 6 - return [sk-g1, ku-g2] - a, b = optimize.fsolve(func, (1.0, 1.0)) - return super(beta_gen, self)._fitstart(data, args=(a,b)) - - @inherit_docstring_from(rv_continuous) - def fit(self, data, *args, **kwds): - """%(super)s - In the special case where both `floc` and `fscale` are given, a - `ValueError` is raised if any value `x` in `data` does not satisfy - `floc < x < floc + fscale`. - """ - # Override rv_continuous.fit, so we can more efficiently handle the - # case where floc and fscale are given. - - f0 = kwds.get('f0', None) - f1 = kwds.get('f1', None) - floc = kwds.get('floc', None) - fscale = kwds.get('fscale', None) - - if floc is None or fscale is None: - # do general fit - return super(beta_gen, self).fit(data, *args, **kwds) - - if f0 is not None and f1 is not None: - # This check is for consistency with `rv_continuous.fit`. - raise ValueError("All parameters fixed. There is nothing to " - "optimize.") - - # Special case: loc and scale are constrained, so we are fitting - # just the shape parameters. This can be done much more efficiently - # than the method used in `rv_continuous.fit`. (See the subsection - # "Two unknown parameters" in the section "Maximum likelihood" of - # the Wikipedia article on the Beta distribution for the formulas.) - - # Normalize the data to the interval [0,1]. - data = (ravel(data) - floc) / fscale - if np.any(data <= 0) or np.any(data >= 1): - raise FitDataError("beta", lower=floc, upper=floc + fscale) - xbar = data.mean() - - if f0 is not None or f1 is not None: - # One of the shape parameters is fixed. - - if f0 is not None: - # The shape parameter a is fixed, so swap the parameters - # and flip the data. We always solve for `a`. The result - # will be swapped back before returning. - b = f0 - data = 1 - data - xbar = 1 - xbar - else: - b = f1 - - # Initial guess for a. Use the formula for the mean of the beta - # distribution, E[x] = a / (a + b), to generate a reasonable - # starting point based on the mean of the data and the given - # value of b. 
- a = b * xbar / (1 - xbar) - - # Compute the MLE for `a` by solving _beta_mle_a. - theta, info, ier, mesg = optimize.fsolve(_beta_mle_a, a, - args=(b, len(data), np.log(data).sum()), full_output=True) - if ier != 1: - raise FitSolverError(mesg=mesg) - a = theta[0] - - if f0 is not None: - # The shape parameter a was fixed, so swap back the - # parameters. - a, b = b, a - - else: - # Neither of the shape parameters is fixed. - - # s1 and s2 are used in the extra arguments passed to _beta_mle_ab - # by optimize.fsolve. - s1 = np.log(data).sum() - s2 = np.log(1 - data).sum() - - # Use the "method of moments" to estimate the initial - # guess for a and b. - fac = xbar * (1 - xbar) / data.var(ddof=0) - 1 - a = xbar * fac - b = (1 - xbar) * fac - - # Compute the MLE for a and b by solving _beta_mle_ab. - theta, info, ier, mesg = optimize.fsolve(_beta_mle_ab, [a, b], - args=(len(data), s1, s2), full_output=True) - if ier != 1: - raise FitSolverError(mesg=mesg) - a, b = theta - - return a, b, floc, fscale - -beta = beta_gen(a=0.0, b=1.0, name='beta') - - -class betaprime_gen(rv_continuous): - """A beta prime continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `betaprime` is:: - - betaprime.pdf(x, a, b) = x**(a-1) * (1+x)**(-a-b) / beta(a, b) - - for ``x > 0``, ``a > 0``, ``b > 0``, where ``beta(a, b)`` is the beta - function (see `scipy.special.beta`). - - %(example)s - - """ - def _rvs(self, a, b): - u1 = gamma.rvs(a,size=self._size) - u2 = gamma.rvs(b,size=self._size) - return (u1 / u2) - - def _pdf(self, x, a, b): - return np.exp(self._logpdf(x, a, b)) - - def _logpdf(self, x, a, b): - return special.xlogy(a-1.0, x) - special.xlog1py(a+b, x) - special.betaln(a,b) - - def _cdf_skip(self, x, a, b): - # remove for now: special.hyp2f1 is incorrect for large a - x = where(x == 1.0, 1.0-1e-6,x) - return pow(x,a)*special.hyp2f1(a+b,a,1+a,-x)/a/special.beta(a,b) - - def _munp(self, n, a, b): - if (n == 1.0): - return where(b > 1, a/(b-1.0), inf) - elif (n == 2.0): - return where(b > 2, a*(a+1.0)/((b-2.0)*(b-1.0)), inf) - elif (n == 3.0): - return where(b > 3, a*(a+1.0)*(a+2.0)/((b-3.0)*(b-2.0)*(b-1.0)), - inf) - elif (n == 4.0): - return where(b > 4, - a*(a+1.0)*(a+2.0)*(a+3.0)/((b-4.0)*(b-3.0) - * (b-2.0)*(b-1.0)), inf) - else: - raise NotImplementedError -betaprime = betaprime_gen(a=0.0, b=500.0, name='betaprime') - - -class bradford_gen(rv_continuous): - """A Bradford continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `bradford` is:: - - bradford.pdf(x, c) = c / (k * (1+c*x)), - - for ``0 < x < 1``, ``c > 0`` and ``k = log(1+c)``. 
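A short sketch of the special-cased beta fit above, where ``floc`` and ``fscale`` pin the support to (0, 1) and only the two shape parameters are solved for; ``scipy.stats.beta`` takes the same shortcut when both are fixed::

    import numpy as np
    from scipy.stats import beta

    data = beta.rvs(2.0, 5.0, size=2000, random_state=2)
    a_hat, b_hat, loc, scale = beta.fit(data, floc=0, fscale=1)
    assert (loc, scale) == (0, 1)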
- - %(example)s - - """ - def _pdf(self, x, c): - return c / (c*x + 1.0) / log1p(c) - - def _cdf(self, x, c): - return log1p(c*x) / log1p(c) - - def _ppf(self, q, c): - return ((1.0+c)**q-1)/c - - def _stats(self, c, moments='mv'): - k = log1p(c) - mu = (c-k)/(c*k) - mu2 = ((c+2.0)*k-2.0*c)/(2*c*k*k) - g1 = None - g2 = None - if 's' in moments: - g1 = sqrt(2)*(12*c*c-9*c*k*(c+2)+2*k*k*(c*(c+3)+3)) - g1 /= sqrt(c*(c*(k-2)+2*k))*(3*c*(k-2)+6*k) - if 'k' in moments: - g2 = c**3*(k-3)*(k*(3*k-16)+24)+12*k*c*c*(k-4)*(k-3) \ - + 6*c*k*k*(3*k-14) + 12*k**3 - g2 /= 3*c*(c*(k-2)+2*k)**2 - return mu, mu2, g1, g2 - - def _entropy(self, c): - k = log1p(c) - return k/2.0 - log(c/k) - def _fitstart(self, data): - loc = data.min()-1e-4 - scale = (data-loc).max() - m = np.mean((data-loc)/scale) - fun = lambda c : (c-log1p(c))/(c*log1p(c)) - m - res = optimize.root(fun, 0.3) - c = res.x - return c, loc, scale -bradford = bradford_gen(a=0.0, b=1.0, name='bradford') - - -class burr_gen(rv_continuous): - """A Burr continuous random variable. - - %(before_notes)s - - See Also - -------- - fisk : a special case of `burr` with ``d = 1`` - - Notes - ----- - The probability density function for `burr` is:: - - burr.pdf(x, c, d) = c * d * x**(-c-1) * (1+x**(-c))**(-d-1) - - for ``x > 0``. - - %(example)s - - """ - def _pdf(self, x, c, d): - return c*d*(x**(-c-1.0))*((1+x**(-c*1.0))**(-d-1.0)) - - def _cdf(self, x, c, d): - return (1+x**(-c*1.0))**(-d**1.0) - - def _ppf(self, q, c, d): - return (q**(-1.0/d)-1)**(-1.0/c) - - def _stats(self, c, d, moments='mv'): - g2c, g2cd = gam(1-2.0/c), gam(2.0/c+d) - g1c, g1cd = gam(1-1.0/c), gam(1.0/c+d) - gd = gam(d) - k = gd*g2c*g2cd - g1c**2 * g1cd**2 - mu = g1c*g1cd / gd - mu2 = k / gd**2.0 - g1, g2 = None, None - g3c, g3cd = None, None - if 's' in moments: - g3c, g3cd = gam(1-3.0/c), gam(3.0/c+d) - g1 = 2*g1c**3 * g1cd**3 + gd*gd*g3c*g3cd - 3*gd*g2c*g1c*g1cd*g2cd - g1 /= sqrt(k**3) - if 'k' in moments: - if g3c is None: - g3c = gam(1-3.0/c) - if g3cd is None: - g3cd = gam(3.0/c+d) - g4c, g4cd = gam(1-4.0/c), gam(4.0/c+d) - g2 = 6*gd*g2c*g2cd * g1c**2 * g1cd**2 + gd**3 * g4c*g4cd - g2 -= 3*g1c**4 * g1cd**4 - 4*gd**2*g3c*g1c*g1cd*g3cd - return mu, mu2, g1, g2 -burr = burr_gen(a=0.0, name='burr') - -#XXX: cf PR #2552 -class fisk_gen(burr_gen): - """A Fisk continuous random variable. - - The Fisk distribution is also known as the log-logistic distribution, and - equals the Burr distribution with ``d == 1``. - - %(before_notes)s - - See Also - -------- - burr - - %(example)s - - """ - def _pdf(self, x, c): - return burr_gen._pdf(self, x, c, 1.0) - - def _cdf(self, x, c): - return burr_gen._cdf(self, x, c, 1.0) - - def _ppf(self, x, c): - return burr_gen._ppf(self, x, c, 1.0) - - def _stats(self, c): - return burr_gen._stats(self, c, 1.0) - - def _entropy(self, c): - return 2 - log(c) -fisk = fisk_gen(a=0.0, name='fisk') - - -# median = loc -class cauchy_gen(rv_continuous): - """A Cauchy continuous random variable. 
- - %(before_notes)s - - Notes - ----- - The probability density function for `cauchy` is:: - - cauchy.pdf(x) = 1 / (pi * (1 + x**2)) - - %(example)s - - """ - def _pdf(self, x): - return 1.0/pi/(1.0+x*x) - - def _cdf(self, x): - return 0.5 + 1.0/pi*arctan(x) - - def _ppf(self, q): - return tan(pi*q-pi/2.0) - - def _sf(self, x): - return 0.5 - 1.0/pi*arctan(x) - - def _isf(self, q): - return tan(pi/2.0-pi*q) - - def _stats(self): - return inf, inf, nan, nan - - def _entropy(self): - return log(4*pi) - - def _fitstart(self, data, args=None): - return (0, 1) -cauchy = cauchy_gen(name='cauchy') - - -class chi_gen(rv_continuous): - """A chi continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `chi` is:: - - chi.pdf(x,df) = x**(df-1) * exp(-x**2/2) / (2**(df/2-1) * gamma(df/2)) - - for ``x > 0``. - - Special cases of `chi` are: - - - ``chi(1, loc, scale) = `halfnormal` - - ``chi(2, 0, scale) = `rayleigh` - - ``chi(3, 0, scale) : `maxwell` - - %(example)s - - """ - def _rvs(self, df): - return sqrt(chi2.rvs(df,size=self._size)) - - def _pdf(self, x, df): - return x**(df-1.)*exp(-x*x*0.5)/(2.0)**(df*0.5-1)/gam(df*0.5) - - def _cdf(self, x, df): - return special.gammainc(df*0.5,0.5*x*x) - - def _ppf(self, q, df): - return sqrt(2*special.gammaincinv(df*0.5,q)) - - def _stats(self, df): - mu = sqrt(2)*special.gamma(df/2.0+0.5)/special.gamma(df/2.0) - mu2 = df - mu*mu - g1 = (2*mu**3.0 + mu*(1-2*df))/asarray(np.power(mu2, 1.5)) - g2 = 2*df*(1.0-df)-6*mu**4 + 4*mu**2 * (2*df-1) - g2 /= asarray(mu2**2.0) - return mu, mu2, g1, g2 - def _fitstart(self, data): - m = data.mean() - v = data.var() - # Supply a starting guess with method of moments: - df = max(np.round(v+m**2),1) - return super(chi_gen, self)._fitstart(data, args=(df,)) -chi = chi_gen(a=0.0, name='chi') - - -## Chi-squared (gamma-distributed with loc=0 and scale=2 and shape=df/2) -class chi2_gen(rv_continuous): - """A chi-squared continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `chi2` is:: - - chi2.pdf(x,df) = 1 / (2*gamma(df/2)) * (x/2)**(df/2-1) * exp(-x/2) - - %(example)s - - """ - def _rvs(self, df): - return mtrand.chisquare(df,self._size) - - def _pdf(self, x, df): - return exp(self._logpdf(x, df)) - - def _logpdf(self, x, df): - # term1 = (df/2.-1)*log(x) - # term1[(df==2)*(x==0)] = 0 - # avoid 0*log(0)==nan - return (df/2.-1)*log(x+1e-300) - x/2. - gamln(df/2.) - (log(2)*df)/2. - - def _cdf(self, x, df): - return special.chdtr(df, x) - - def _sf(self, x, df): - return special.chdtrc(df, x) - - def _isf(self, p, df): - return special.chdtri(df, p) - - def _ppf(self, p, df): - return self._isf(1.0-p, df) - - def _stats(self, df): - mu = df - mu2 = 2*df - g1 = 2*sqrt(2.0/df) - g2 = 12.0/df - return mu, mu2, g1, g2 - def _fitstart(self, data): - m = data.mean() - v = data.var() - # Supply a starting guess with method of moments: - df = max(np.round((m+v/2)/2),1) - return super(chi2_gen, self)._fitstart(data, args=(df,)) -chi2 = chi2_gen(a=0.0, name='chi2') - - -class cosine_gen(rv_continuous): - """A cosine continuous random variable. - - %(before_notes)s - - Notes - ----- - The cosine distribution is an approximation to the normal distribution. - The probability density function for `cosine` is:: - - cosine.pdf(x) = 1/(2*pi) * (1+cos(x)) - - for ``-pi <= x <= pi``. 
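As noted above, the chi-squared distribution is a gamma distribution with shape ``df/2`` and ``scale=2``; a quick check::

    import numpy as np
    from scipy.stats import chi2, gamma

    x = np.linspace(0.5, 10.0, 5)
    df = 4
    assert np.allclose(chi2.pdf(x, df), gamma.pdf(x, df / 2.0, scale=2.0))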
- - %(example)s - - """ - def _pdf(self, x): - return 1.0/2/pi*(1+cos(x)) - - def _cdf(self, x): - return 1.0/2/pi*(pi + x + sin(x)) - - def _stats(self): - return 0.0, pi*pi/3.0-2.0, 0.0, -6.0*(pi**4-90)/(5.0*(pi*pi-6)**2) - - def _entropy(self): - return log(4*pi)-1.0 -cosine = cosine_gen(a=-pi, b=pi, name='cosine') - - -class dgamma_gen(rv_continuous): - """A double gamma continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `dgamma` is:: - - dgamma.pdf(x, a) = 1 / (2*gamma(a)) * abs(x)**(a-1) * exp(-abs(x)) - - for ``a > 0``. - - %(example)s - - """ - def _rvs(self, a): - u = random(size=self._size) - return (gamma.rvs(a,size=self._size)*where(u >= 0.5,1,-1)) - - def _pdf(self, x, a): - ax = abs(x) - return 1.0/(2*special.gamma(a))*ax**(a-1.0) * exp(-ax) - - def _logpdf(self, x, a): - ax = abs(x) - return (a-1.0)*log(ax) - ax - log(2) - gamln(a) - - def _cdf(self, x, a): - fac = 0.5*special.gammainc(a,abs(x)) - return where(x > 0,0.5+fac,0.5-fac) - - def _sf(self, x, a): - fac = 0.5*special.gammainc(a,abs(x)) - return where(x > 0,0.5-fac,0.5+fac) - - def _ppf(self, q, a): - fac = special.gammainccinv(a,1-abs(2*q-1)) - return where(q > 0.5, fac, -fac) - - def _stats(self, a): - mu2 = a*(a+1.0) - return 0.0, mu2, 0.0, (a+2.0)*(a+3.0)/mu2-3.0 -dgamma = dgamma_gen(name='dgamma') - - -class dweibull_gen(rv_continuous): - """A double Weibull continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `dweibull` is:: - - dweibull.pdf(x, c) = c / 2 * abs(x)**(c-1) * exp(-abs(x)**c) - - %(example)s - - """ - def _rvs(self, c): - u = random(size=self._size) - return weibull_min.rvs(c, size=self._size)*(where(u >= 0.5,1,-1)) - - def _pdf(self, x, c): - ax = abs(x) - Px = c/2.0*ax**(c-1.0)*exp(-ax**c) - return Px - - def _logpdf(self, x, c): - ax = abs(x) - return log(c) - log(2.0) + (c-1.0)*log(ax) - ax**c - - def _cdf(self, x, c): - Cx1 = 0.5*exp(-abs(x)**c) - return where(x > 0, 1-Cx1, Cx1) - - def _ppf_skip(self, q, c): - fac = where(q <= 0.5,2*q,2*q-1) - fac = pow(asarray(log(1.0/fac)),1.0/c) - return where(q > 0.5,fac,-fac) - - def _stats(self, c): - var = gam(1+2.0/c) - return 0.0, var, 0.0, gam(1+4.0/c)/var -dweibull = dweibull_gen(name='dweibull') - - -## Exponential (gamma distributed with a=1.0, loc=loc and scale=scale) -class expon_gen(rv_continuous): - """An exponential continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `expon` is:: - - expon.pdf(x) = lambda * exp(- lambda*x) - - for ``x >= 0``. - - The scale parameter is equal to ``scale = 1.0 / lambda``. - - `expon` does not have shape parameters. - - %(example)s - - """ - def link(self, x, logSF, phat, ix): - ''' Link for x,SF and parameters of Exponential distribution - - CALL phati = expon.link(x,logSF,phat,i) - - phati = parameter i as function of x, logSF and phat(j) where j ~= i - x = quantile - logSF = logarithm of the survival probability - - LINK is a function connecting the quantile (x) and the survival - probability (R) with the fixed distribution parameter, i.e.: - phat(i) = link(x,logSF,phat,i), - where logSF = log(Prob(X>x;phat)). 
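For the exponential distribution the relation behind ``link`` can be checked directly, since ``logSF = -(x - loc)/scale``; a minimal numeric sketch::

    import numpy as np
    from scipy.stats import expon

    loc, scale, x = 1.0, 2.5, 4.0
    logSF = expon.logsf(x, loc=loc, scale=scale)
    # solve for the scale (ix == 1) and for the location (ix == 0)
    assert np.isclose(-(x - loc) / logSF, scale)
    assert np.isclose(x + scale * logSF, loc)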
- - Example % See proflog - - See also profile - ''' - if ix == 1: - return - (x - phat[0]) / logSF - elif ix == 0: - return x + phat[1] * logSF - - - def _rvs(self): - return mtrand.standard_exponential(self._size) - - def _pdf(self, x): - return exp(-x) - - def _logpdf(self, x): - return -x - - def _cdf(self, x): - return -expm1(-x) - - def _ppf(self, q): - return -log1p(-q) - - def _sf(self,x): - return exp(-x) - - def _logsf(self, x): - return -x - - def _isf(self,q): - return -log(q) - - def _stats(self): - return 1.0, 1.0, 2.0, 6.0 - - def _entropy(self): - return 1.0 -expon = expon_gen(a=0.0, name='expon') - - -class exponweib_gen(rv_continuous): - """An exponentiated Weibull continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `exponweib` is:: - - exponweib.pdf(x, a, c) = - a * c * (1-exp(-x**c))**(a-1) * exp(-x**c)*x**(c-1) - - for ``x > 0``, ``a > 0``, ``c > 0``. - - %(example)s - - """ - def _pdf(self, x, a, c): - exc = exp(-x**c) - return a*c*(1-exc)**asarray(a-1) * exc * x**(c-1) - - def _logpdf(self, x, a, c): - exc = exp(-x**c) - return log(a) + log(c) + (a-1.)*log1p(-exc) - x**c + (c-1.0)*log(x) - - def _cdf(self, x, a, c): - exm1c = -expm1(-x**c) - return (exm1c)**a - - def _ppf(self, q, a, c): - return (-log1p(-q**(1.0/a)))**asarray(1.0/c) -exponweib = exponweib_gen(a=0.0, name='exponweib') - - -class exponpow_gen(rv_continuous): - """An exponential power continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `exponpow` is:: - - exponpow.pdf(x, b) = b * x**(b-1) * exp(1+x**b - exp(x**b)) - - for ``x >= 0``, ``b > 0``. - - %(example)s - - """ - def _pdf(self, x, b): - xbm1 = x**(b-1.0) - xb = xbm1 * x - return exp(1)*b*xbm1 * exp(xb - exp(xb)) - - def _logpdf(self, x, b): - xb = x**(b-1.0)*x - return 1 + log(b) + (b-1.0)*log(x) + xb - exp(xb) - - def _cdf(self, x, b): - return -expm1(-expm1(x**b)) - - def _sf(self, x, b): - return exp(-expm1(x**b)) - - def _isf(self, x, b): - return (log1p(-log(x)))**(1./b) - - def _ppf(self, q, b): - return pow(log1p(-log1p(-q)), 1.0/b) -exponpow = exponpow_gen(a=0.0, name='exponpow') - - -class fatiguelife_gen(rv_continuous): - """A fatigue-life (Birnbaum-Sanders) continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `fatiguelife` is:: - - fatiguelife.pdf(x,c) = - (x+1) / (2*c*sqrt(2*pi*x**3)) * exp(-(x-1)**2/(2*x*c**2)) - - for ``x > 0``. - - %(example)s - - """ - def _rvs(self, c): - z = norm.rvs(size=self._size) - x = 0.5*c*z - x2 = x*x - t = 1.0 + 2*x2 + 2*x*sqrt(1 + x2) - return t - - def _pdf(self, x, c): - return (x+1)/asarray(2*c*sqrt(2*pi*x**3))*exp(-(x-1)**2/asarray((2.0*x*c**2))) - - def _logpdf(self, x, c): - return log(x+1) - (x-1)**2 / (2.0*x*c**2) - log(2*c) - 0.5*(log(2*pi) + 3*log(x)) - - def _cdf(self, x, c): - return special.ndtr(1.0/c*(sqrt(x)-1.0/asarray(sqrt(x)))) - - def _ppf(self, q, c): - tmp = c*special.ndtri(q) - return 0.25*(tmp + sqrt(tmp**2 + 4))**2 - - def _stats(self, c): - c2 = c*c - mu = c2 / 2.0 + 1 - den = 5*c2 + 4 - mu2 = c2*den / 4.0 - g1 = 4*c*sqrt(11*c2+6.0)/np.power(den, 1.5) - g2 = 6*c2*(93*c2+41.0) / den**2.0 - return mu, mu2, g1, g2 -fatiguelife = fatiguelife_gen(a=0.0, name='fatiguelife') - - -class foldcauchy_gen(rv_continuous): - """A folded Cauchy continuous random variable. 
- - %(before_notes)s - - Notes - ----- - The probability density function for `foldcauchy` is:: - - foldcauchy.pdf(x, c) = 1/(pi*(1+(x-c)**2)) + 1/(pi*(1+(x+c)**2)) - - for ``x >= 0``. - - %(example)s - - """ - def _rvs(self, c): - return abs(cauchy.rvs(loc=c,size=self._size)) - - def _pdf(self, x, c): - return 1.0/pi*(1.0/(1+(x-c)**2) + 1.0/(1+(x+c)**2)) - - def _cdf(self, x, c): - return 1.0/pi*(arctan(x-c) + arctan(x+c)) - - def _stats(self, c): - return inf, inf, nan, nan -foldcauchy = foldcauchy_gen(a=0.0, name='foldcauchy') - - -class f_gen(rv_continuous): - """An F continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `f` is:: - - df2**(df2/2) * df1**(df1/2) * x**(df1/2-1) - F.pdf(x, df1, df2) = -------------------------------------------- - (df2+df1*x)**((df1+df2)/2) * B(df1/2, df2/2) - - for ``x > 0``. - - %(example)s - - """ - def _rvs(self, dfn, dfd): - return mtrand.f(dfn, dfd, self._size) - - def _pdf(self, x, dfn, dfd): - return exp(self._logpdf(x, dfn, dfd)) - - def _logpdf(self, x, dfn, dfd): - n = 1.0*dfn - m = 1.0*dfd - lPx = m/2*log(m) + n/2*log(n) + (n/2-1)*log(x) - lPx -= ((n+m)/2)*log(m+n*x) + special.betaln(n/2,m/2) - return lPx - - def _cdf(self, x, dfn, dfd): - return special.fdtr(dfn, dfd, x) - - def _sf(self, x, dfn, dfd): - return special.fdtrc(dfn, dfd, x) - - def _ppf(self, q, dfn, dfd): - return special.fdtri(dfn, dfd, q) - - def _stats(self, dfn, dfd): - v2 = asarray(dfd*1.0) - v1 = asarray(dfn*1.0) - mu = where(v2 > 2, v2 / asarray(v2 - 2), inf) - mu2 = 2*v2*v2*(v2+v1-2)/(v1*(v2-2)**2 * (v2-4)) - mu2 = where(v2 > 4, mu2, inf) - g1 = 2*(v2+2*v1-2)/(v2-6)*sqrt((2*v2-4)/(v1*(v2+v1-2))) - g1 = where(v2 > 6, g1, nan) - g2 = 3/(2*v2-16)*(8+g1*g1*(v2-6)) - g2 = where(v2 > 8, g2, nan) - return mu, mu2, g1, g2 - def _fitstart(self, data): - m = data.mean() - v = data.var() - # Supply a starting guess with method of moments: - dfd = max(np.round(2*m/(m-1)), 5) - dfn = max(np.round(2*dfd*dfd*(dfd-2)/(v*(dfd-4)*(dfd-2)**2 - 2*dfd*dfd)), 1) - return super(f_gen, self)._fitstart(data, args=(dfn,dfd,)) -f = f_gen(a=0.0, name='f') - - -## Folded Normal -## abs(Z) where (Z is normal with mu=L and std=S so that c=abs(L)/S) -## -## note: regress docs have scale parameter correct, but first parameter -## he gives is a shape parameter A = c * scale - -## Half-normal is folded normal with shape-parameter c=0. - -class foldnorm_gen(rv_continuous): - """A folded normal continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `foldnorm` is:: - - foldnormal.pdf(x, c) = sqrt(2/pi) * cosh(c*x) * exp(-(x**2+c**2)/2) - - for ``c >= 0``. 
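The comment above notes that the half-normal is the folded normal with shape ``c = 0``; this is easy to verify::

    import numpy as np
    from scipy.stats import foldnorm, halfnorm

    x = np.linspace(0.0, 4.0, 9)
    assert np.allclose(foldnorm.pdf(x, 0.0), halfnorm.pdf(x))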
- - %(example)s - - """ - def _argcheck(self, c): - return (c >= 0) - - def _rvs(self, c): - return abs(norm.rvs(loc=c,size=self._size)) - - def _pdf(self, x, c): - return sqrt(2.0/pi)*cosh(c*x)*exp(-(x*x+c*c)/2.0) - - def _cdf(self, x, c): - return special.ndtr(x-c) + special.ndtr(x+c) - 1.0 - - def _stats(self, c): - fac = special.erf(c/sqrt(2)) - mu = sqrt(2.0/pi)*exp(-0.5*c*c)+c*fac - mu2 = c*c + 1 - mu*mu - c2 = c*c - g1 = sqrt(2/pi)*exp(-1.5*c2)*(4-pi*exp(c2)*(2*c2+1.0)) - g1 += 2*c*fac*(6*exp(-c2) + 3*sqrt(2*pi)*c*exp(-c2/2.0)*fac + - pi*c*(fac*fac-1)) - g1 /= pi*np.power(mu2, 1.5) - - g2 = c2*c2+6*c2+3+6*(c2+1)*mu*mu - 3*mu**4 - g2 -= 4*exp(-c2/2.0)*mu*(sqrt(2.0/pi)*(c2+2)+c*(c2+3)*exp(c2/2.0)*fac) - g2 /= mu2**2.0 - return mu, mu2, g1, g2 -foldnorm = foldnorm_gen(a=0.0, name='foldnorm') - - -## Extreme Value Type II or Frechet -## (defined in Regress+ documentation as Extreme LB) as -## a limiting value distribution. -## -class frechet_r_gen(rv_continuous): - """A Frechet right (or Weibull minimum) continuous random variable. - - %(before_notes)s - - See Also - -------- - weibull_min : The same distribution as `frechet_r`. - frechet_l, weibull_max - - Notes - ----- - The probability density function for `frechet_r` is:: - - frechet_r.pdf(x, c) = c * x**(c-1) * exp(-x**c) - - for ``x > 0``, ``c > 0``. - - %(example)s - - """ - def link(self, x, logSF, phat, ix): - #u = phat[1] - if ix == 0: - phati = log(-logSF) / log((x - phat[1]) / phat[2]) - elif ix == 1: - phati = x - phat[2] * (-logSF) ** (1. / phat[0]) - elif ix == 2: - phati = (x - phat[1]) / (-logSF) ** (1. / phat[0]) - else: - raise IndexError('Index to the fixed parameter is out of bounds') - return phati - - def _pdf(self, x, c): - return c*pow(x,c-1)*exp(-pow(x,c)) - - def _logpdf(self, x, c): - return log(c) + (c-1)*log(x) - pow(x,c) - - def _cdf(self, x, c): - return -expm1(-pow(x,c)) - - def _ppf(self, q, c): - return pow(-log1p(-q),1.0/c) - - def _munp(self, n, c): - return special.gamma(1.0+n*1.0/c) - - def _entropy(self, c): - return -_EULER / c - log(c) + _EULER + 1 - def _fitstart(self, data): - loc = data.min() - 0.01 #*np.std(data) - chat = 1./(6**(1/2)/pi*np.std(log(data-loc))) - scale = np.mean((data-loc)**chat)**(1./chat) - return chat, loc, scale - -frechet_r = frechet_r_gen(a=0.0, name='frechet_r') -weibull_min = frechet_r_gen(a=0.0, name='weibull_min') - - -class frechet_l_gen(rv_continuous): - """A Frechet left (or Weibull maximum) continuous random variable. - - %(before_notes)s - - See Also - -------- - weibull_max : The same distribution as `frechet_l`. - frechet_r, weibull_min - - Notes - ----- - The probability density function for `frechet_l` is:: - - frechet_l.pdf(x, c) = c * (-x)**(c-1) * exp(-(-x)**c) - - for ``x < 0``, ``c > 0``. 
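A quick check of the ``frechet_r`` / ``weibull_min`` density formula quoted above, using the ``scipy.stats`` alias ``weibull_min``::

    import numpy as np
    from scipy.stats import weibull_min

    c = 1.8
    x = np.linspace(0.1, 3.0, 7)
    assert np.allclose(weibull_min.pdf(x, c), c * x**(c - 1) * np.exp(-x**c))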
- - %(example)s - - """ - def _pdf(self, x, c): - return c*pow(-x,c-1)*exp(-pow(-x,c)) - - def _cdf(self, x, c): - return exp(-pow(-x,c)) - - def _ppf(self, q, c): - return -pow(-log(q),1.0/c) - - def _munp(self, n, c): - val = special.gamma(1.0+n*1.0/c) - if (int(n) % 2): - sgn = -1 - else: - sgn = 1 - return sgn * val - - def _entropy(self, c): - return -_EULER / c - log(c) + _EULER + 1 - def _fitstart(self, data): - loc = data.max() + 0.1*np.std(data) - chat = 1./(6**(1/2)/pi*np.std(log(loc-data))) - scale = np.mean((loc-data)**chat)**(1./chat) - return chat, loc, scale -frechet_l = frechet_l_gen(b=0.0, name='frechet_l') -weibull_max = frechet_l_gen(b=0.0, name='weibull_max') - - -class genlogistic_gen(rv_continuous): - """A generalized logistic continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `genlogistic` is:: - - genlogistic.pdf(x, c) = c * exp(-x) / (1 + exp(-x))**(c+1) - - for ``x > 0``, ``c > 0``. - - %(example)s - - """ - def _pdf(self, x, c): - Px = c*exp(-x)/(1+exp(-x))**(c+1.0) - return Px - - def _logpdf(self, x, c): - return log(c) - x - (c+1.0)*log1p(exp(-x)) - - def _cdf(self, x, c): - Cx = (1+exp(-x))**(-c) - return Cx - - def _ppf(self, q, c): - vals = -log(pow(q,-1.0/c)-1) - return vals - - def _stats(self, c): - zeta = special.zeta - mu = _EULER + special.psi(c) - mu2 = pi*pi/6.0 + zeta(2,c) - g1 = -2*zeta(3,c) + 2*_ZETA3 - g1 /= np.power(mu2, 1.5) - g2 = pi**4/15.0 + 6*zeta(4,c) - g2 /= mu2**2.0 - return mu, mu2, g1, g2 -genlogistic = genlogistic_gen(name='genlogistic') - - -def log1pxdx(x): - '''Computes Log(1+x)/x - ''' - xd = where((x == 0) | (x == inf), 1.0, x) # avoid 0/0 or inf/inf - y = where(x == 0, 1.0, log1p(x) / xd) - return where(x == inf, 0.0, y) - -class genpareto_gen(rv_continuous): - """A generalized Pareto continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `genpareto` is:: - - genpareto.pdf(x, c) = exp(-x) - - for c==0 - - genpareto.pdf(x, c) = (1 + c * x)**(-1 - 1/c) - - for ``c != 0``, and for ``x >= 0`` for all c, - and ``x < 1/abs(c)`` for ``c < 0``. - - %(example)s - - """ - def link(self, x, logSF, phat, ix): - # Reference - # Stuart Coles (2004) - # "An introduction to statistical modelling of extreme values". - # Springer series in statistics - - u = phat[1] - if ix == 0: - raise ValueError('link(x,logSF,phat,i) where i=0 is not implemented!') - elif ix == 2: - # Reorganizing w.r.t. phat[2] (scale), Eq. 
4.13 and 4.14, pp 81 in Coles (2004) gives - # link = -(x-phat[1]).*phat[0]/expm1(phat[0]*logSF) - if phat[0] != 0.0: - phati = (x - u) * phat[0] / expm1(-phat[0] * logSF) - else: - phati = -(x - u) / logSF - elif ix == 1: - if phat[0] != 0: - phati = x + phat[2] * expm1(phat[0] * logSF) / phat[0] - else: - phati = x + phat(2) * logSF - else: - raise IndexError('Index to the fixed parameter is out of bounds') - return phati - - def _argcheck(self, c): - c = asarray(c) - sml = floatinfo.machar.xmin # pab avoid division by zero warning - self.b = where(c < 0, 1.0 / (abs(c) + sml), inf) - return where(abs(c) == inf, 0, 1) - - def _pdf(self, x, c): - return exp(self._logpdf(x, c)) - - def _logpdf(self, x, c): - x1 = where((c == 0) & (x == inf), 0.0, x) - cx = where((c == 0) & (x == inf), 0.0, c * x1) - logpdf = where((cx == inf) | (cx == -1), -inf, -(x + cx) * log1pxdx(cx)) - putmask(logpdf, (c == -1) & (x == 1.0), 0.0) - return logpdf - #return (-1.0-1.0/c) * np.log1p(c*x) - def _logsf(self, x, c): - cx = c * x - return where((0.0 < x) & (-1.0 <= cx) & (c != 0), -log1p(cx) / c, -x) - def _cdf(self, x, c): - log_sf = self._logsf(x, c) - return - expm1(log_sf) - #return 1.0 - power(1+c*x,asarray(-1.0/c)) - def _sf(self, x, c): - log_sf = self._logsf(x, c) - return exp(log_sf) - def _ppf(self, q, c): - log_sf = log1p(-q) - return where((c != 0) & (-inf < log_sf), expm1(-c * log_sf) / c, -log_sf) - def _isf(self, q, c): - log_sf = log(q) - return where((c != 0) & (-inf < log_sf), expm1(-c * log_sf) / c, -log_sf) - #vals = 1.0/c * (power(1-q, -c)-1) - #return vals - - def _fitstart(self, data): - d = asarray(data) - loc = d.min() - 0.01 * d.std() - #moments estimator - d1 = d - loc - m = d1.mean() - s = d1.std() - - shape = ((m / s) ** 2 - 1) / 2 - scale = m * ((m / s) ** 2 + 1) / 2 - return shape, loc, scale - - def hessian_nnlf(self, theta, x, eps=None): - try: - loc = theta[-2] - scale = theta[-1] - args = tuple(theta[:-2]) - except IndexError: - raise ValueError, "Not enough input arguments." - if not self._argcheck(*args) or scale <= 0: - return inf - x = asarray((x - loc) / scale) - cond0 = (x <= self.a) | (x >= self.b) - if any(cond0): - np = self.numargs + 2 - return valarray((np, np), value=nan) - eps = floatinfo.machar.eps - c = args[0] - n = len(x) - if abs(c) > eps: - cx = c * x; - sumlog1pcx = sum(log1p(cx)); - #LL = n*log(scale) + (1-1/k)*sumlog1mkxn - r = x / (1.0 + cx) - sumix = sum(1.0 / (1.0 + cx) ** 2.0) - - sumr = sum(r) - sumr2 = sum(r ** 2.0) - H11 = -2 * sumlog1pcx / c ** 3 + 2 * sumr / c ** 2 + (1.0 + 1.0 / c) * sumr2 - H22 = c * (c + 1) * sumix / scale ** 2.0 - H33 = (n - 2 * (c + 1) * sumr + c * (c + 1) * sumr2) / scale ** 2.0; - H12 = -sum((1 - x) / ((1 + cx) ** 2.0)) / scale - H23 = -(c + 1) * sumix / scale ** 2.0 - H13 = -(sumr - (c + 1) * sumr2) / scale; - - - else: # c == 0 - sumx = sum(x); - #LL = n*log(scale) + sumx; - - sumx2 = sum(x ** 2.0); - H11 = -(2 / 3) * sum(x ** 3.0) + sumx2 - H22 = 0.0 - H12 = -(n - sum(x)) / scale - H23 = -n * 1.0 / scale ** 2.0 - H33 = (n - 2 * sumx) / scale ** 2.0 - H13 = -(sumx - sumx2) / scale - - #% Hessian matrix - H = [[H11, H12, H13], [H12, H22, H23], [H13, H23, H33]] - return asarray(H) - def _stats(self, c): - #return None,None,None,None - k = -c - m = where(k < -1.0, inf, 1.0 / (1 + k)) - v = where(k < -0.5, nan, 1.0 / ((1 + k) ** 2.0 * (1 + 2 * k))) - sk = where(k < -1.0 / 3, nan, 2. * (1 - k) * sqrt(1 + 2.0 * k) / (1.0 + 3. 
* k)) - #% E(X^r) = s^r*(-k)^-(r+1)*gamma(1+r)*gamma(-1/k-r)/gamma(1-1/k) - #% = s^r*gamma(1+r)./( (1+k)*(1+2*k).*....*(1+r*k)) - #% E[(1-k(X-m0)/s)^r] = 1/(1+k*r) - - #%Ex3 = (sk.*sqrt(v)+3*m).*v+m^3 - #%Ex3 = 6.*s.^3/((1+k).*(1+2*k).*(1+3*k)) - r = 4.0; - Ex4 = gam(1. + r) / ((1. + k) * (1. + 2. * k) * (1. + 3. * k) * (1 + 4. * k)) - m1 = m - ku = where(k < -1. / 4, nan, (Ex4 - 4. * sk * v ** (3. / 2) * m1 - 6 * m1 ** 2. * v - m1 ** 4.) / v ** 2. - 3.0) - return m, v, sk, ku - def _munp(self, n, c): - k = arange(0,n+1) - val = (-1.0/c)**n * sum(comb(n,k)*(-1)**k / (1.0-c*k),axis=0) - return where(c*n < 1, val, inf) - - def _entropy(self, c): - if (c >= 0): - return 1 + c - else: - self.b = -1.0 / c - return rv_continuous._entropy(self, c) -genpareto = genpareto_gen(a=0.0, name='genpareto') - - -class genexpon_gen(rv_continuous): - """A generalized exponential continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `genexpon` is:: - - genexpon.pdf(x, a, b, c) = (a + b * (1 - exp(-c*x))) * \ - exp(-a*x - b*x + b/c * (1-exp(-c*x))) - - for ``x >= 0``, ``a,b,c > 0``. - - References - ---------- - H.K. Ryu, "An Extension of Marshall and Olkin's Bivariate Exponential - Distribution", Journal of the American Statistical Association, 1993. - - N. Balakrishnan, "The Exponential Distribution: Theory, Methods and - Applications", Asit P. Basu. - - %(example)s - - """ - def link(self, x, logSF, phat, ix): - xn = (x - phat[3]) / phat[4] - b = phat[1] - c = phat[2] - fact1 = (xn + expm1(-c * xn) / c) - if ix == 0: - phati = b * fact1 + logSF - elif ix == 1: - phati = (phat[0] - logSF) / fact1 - else: - raise IndexError('Only implemented for ix in [1,2]!') - return phati - - def _pdf(self, x, a, b, c): - return (a+b*(-expm1(-c*x)))*exp((-a-b)*x+b*(-expm1(-c*x))/c) - - def _cdf(self, x, a, b, c): - return -expm1((-a-b)*x + b*(-expm1(-c*x))/c) - - def _logpdf(self, x, a, b, c): - return np.log(a+b*(-expm1(-c*x))) + (-a-b)*x+b*(-expm1(-c*x))/c -genexpon = genexpon_gen(a=0.0, name='genexpon') - - -class genextreme_gen(rv_continuous): - """A generalized extreme value continuous random variable. - - %(before_notes)s - - See Also - -------- - gumbel_r - - Notes - ----- - For ``c=0``, `genextreme` is equal to `gumbel_r`. 
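The statement above that ``genextreme`` with ``c = 0`` reduces to ``gumbel_r`` can be verified numerically::

    import numpy as np
    from scipy.stats import genextreme, gumbel_r

    x = np.linspace(-2.0, 5.0, 8)
    assert np.allclose(genextreme.pdf(x, 0.0), gumbel_r.pdf(x))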
- The probability density function for `genextreme` is:: - - genextreme.pdf(x, c) = - exp(-exp(-x))*exp(-x), for c==0 - exp(-(1-c*x)**(1/c))*(1-c*x)**(1/c-1), for x <= 1/c, c > 0 - - %(example)s - - """ - def _argcheck(self, c): - min = np.minimum - max = np.maximum - sml = floatinfo.machar.xmin - self.b = where(c > 0, 1.0 / max(c, sml),inf) - self.a = where(c < 0, 1.0 / min(c,-sml), -inf) - return where(abs(c) == inf, 0, 1) - - def _pdf(self, x, c): - return exp(self._logpdf(x, c)) - def _logpdf(self, x, c): - x1 = where((c == 0) & (x == inf), 0.0, x) - cx = c * x1 - cond1 = (c==0) * (x==x) - logex2 = where(cond1,0.0,log1p(-cx)) - logpex2 = -x * log1pxdx(-cx) - #logpex2 = where(cond1,-x,logex2/c) - pex2 = exp(logpex2) - # Handle special cases - logpdf = where((cx == 1) | (cx == -inf),-inf,-pex2+logpex2-logex2) - putmask(logpdf, (c == 1) & (x == 1), 0.0) - return exp(logpdf) - - - def _cdf(self, x, c): - return exp(self._logcdf(x, c)) - def _logcdf(self, x, c): - x1 = where((c == 0) & (x == inf), 0.0, x) - cx = c * x1 - loglogcdf = -x * log1pxdx(-cx) - #loglogcdf = where((c==0)*(x==x),-x,log1p(-cx)/c) - return -exp(loglogcdf) - def _sf(self, x, c): - return -expm1(self._logcdf(x, c)) - def _ppf(self, q, c): - x = -log(-log(q)) - return where((c == 0)*(x == x),x,-expm1(-c*x)/c) - - def _stats(self, c): - g = lambda n: gam(n*c+1) - g1 = g(1) - g2 = g(2) - g3 = g(3) - g4 = g(4) - g2mg12 = where(abs(c) < 1e-7,(c*pi)**2.0/6.0,g2-g1**2.0) - gam2k = where(abs(c) < 1e-7,pi**2.0/6.0, expm1(gamln(2.0*c+1.0)-2*gamln(c+1.0))/c**2.0) - eps = 1e-14 - gamk = where(abs(c) < eps,-_EULER,expm1(gamln(c+1))/c) - - m = where(c < -1.0,nan,-gamk) - v = where(c < -0.5,nan,g1**2.0*gam2k) - - # skewness - sk1 = where(c < -1./3,nan,np.sign(c)*(-g3+(g2+2*g2mg12)*g1)/((g2mg12)**(3./2.))) - sk = where(abs(c) <= eps**0.29,12*sqrt(6)*_ZETA3/pi**3,sk1) - - # kurtosis - ku1 = where(c < -1./4,nan,(g4+(-4*g3+3*(g2+g2mg12)*g1)*g1)/((g2mg12)**2)) - ku = where(abs(c) <= (eps)**0.23,12.0/5.0,ku1-3.0) - return m,v,sk,ku - - def _munp(self, n, c): - k = arange(0,n+1) - vals = 1.0/c**n * sum(comb(n,k) * (-1)**k * special.gamma(c*k + 1),axis=0) - return where(c*n > -1, vals, inf) - def _fitstart(self, data): - d = asarray(data) - #Probability weighted moments - log = np.log - n = len(d) - d.sort() - koeff1 = np.r_[0:n] / (n - 1) - koeff2 = koeff1 * (np.r_[0:n] - 1) / (n - 2) - b2 = np.dot(koeff2, d) / n - b1 = np.dot(koeff1, d) / n - b0 = d.mean() - z = (2 * b1 - b0) / (3 * b2 - b0) - log(2) / log(3) - shape = 7.8590 * z + 2.9554 * z ** 2 - scale = (2 * b1 - b0) * shape / (exp(gamln(1 + shape)) * (1 - 2 ** (-shape))) - loc = b0 + scale * (expm1(gamln(1 + shape))) / shape - return shape, loc, scale -genextreme = genextreme_gen(name='genextreme') - - -def _digammainv(y): - # Inverse of the digamma function (real positive arguments only). - # This function is used in the `fit` method of `gamma_gen`. - # The function uses either optimize.fsolve or optimize.newton - # to solve `digamma(x) - y = 0`. There is probably room for - # improvement, but currently it works over a wide range of y: - # >>> y = 64*np.random.randn(1000000) - # >>> y.min(), y.max() - # (-311.43592651416662, 351.77388222276869) - # x = [_digammainv(t) for t in y] - # np.abs(digamma(x) - y).max() - # 1.1368683772161603e-13 - # - _em = 0.5772156649015328606065120 - func = lambda x: special.digamma(x) - y - if y > -0.125: - x0 = exp(y) + 0.5 - if y < 10: - # Some experimentation shows that newton reliably converges - # must faster than fsolve in this y range. 
For larger y, - # newton sometimes fails to converge. - value = optimize.newton(func, x0, tol=1e-10) - return value - elif y > -3: - x0 = exp(y/2.332) + 0.08661 - else: - x0 = 1.0 / (-y - _em) - - value, info, ier, mesg = optimize.fsolve(func, x0, xtol=1e-11, - full_output=True) - if ier != 1: - raise RuntimeError("_digammainv: fsolve failed, y = %r" % y) - - return value[0] - - -## Gamma (Use MATLAB and MATHEMATICA (b=theta=scale, a=alpha=shape) definition) - -## gamma(a, loc, scale) with a an integer is the Erlang distribution -## gamma(1, loc, scale) is the Exponential distribution -## gamma(df/2, 0, 2) is the chi2 distribution with df degrees of freedom. - -class gamma_gen(rv_continuous): - """A gamma continuous random variable. - - %(before_notes)s - - See Also - -------- - erlang, expon - - Notes - ----- - The probability density function for `gamma` is:: - - gamma.pdf(x, a) = lambda**a * x**(a-1) * exp(-lambda*x) / gamma(a) - - for ``x >= 0``, ``a > 0``. Here ``gamma(a)`` refers to the gamma function. - - The scale parameter is equal to ``scale = 1.0 / lambda``. - - `gamma` has a shape parameter `a` which needs to be set explicitly. For instance: - - >>> from scipy.stats import gamma - >>> rv = gamma(3., loc = 0., scale = 2.) - - produces a frozen form of `gamma` with shape ``a = 3.``, ``loc =0.`` - and ``lambda = 1./scale = 1./2.``. - - When ``a`` is an integer, `gamma` reduces to the Erlang - distribution, and when ``a=1`` to the exponential distribution. - - %(example)s - - """ - def _rvs(self, a): - return mtrand.standard_gamma(a, self._size) - - def _pdf(self, x, a): - return exp(self._logpdf(x, a)) - - def _logpdf(self, x, a): - return special.xlogy(a-1.0, x) - x - gamln(a) - - def _cdf(self, x, a): - return special.gammainc(a, x) - - def _sf(self, x, a): - return special.gammaincc(a, x) - - def _ppf(self, q, a): - return special.gammaincinv(a,q) - - def _stats(self, a): - return a, a, 2.0/sqrt(a), 6.0/a - - def _entropy(self, a): - return special.psi(a)*(1-a) + 1 + gamln(a) - - def _fitstart(self, data): - # The skewness of the gamma distribution is `4 / sqrt(a)`. - # We invert that to estimate the shape `a` using the skewness - # of the data. The formula is regularized with 1e-8 in the - # denominator to allow for degenerate data where the skewness - # is close to 0. - a = 4 / (1e-8 + _skew(data)**2) - return super(gamma_gen, self)._fitstart(data, args=(a,)) - - @inherit_docstring_from(rv_continuous) - def fit(self, data, *args, **kwds): - f0 = kwds.get('f0', None) - floc = kwds.get('floc', None) - fscale = kwds.get('fscale', None) - - if floc is None: - # loc is not fixed. Use the default fit method. - return super(gamma_gen, self).fit(data, *args, **kwds) - - # Special case: loc is fixed. - - if f0 is not None and fscale is not None: - # This check is for consistency with `rv_continuous.fit`. - # Without this check, this function would just return the - # parameters that were given. - raise ValueError("All parameters fixed. There is nothing to " - "optimize.") - - # Fixed location is handled by shifting the data. - data = np.asarray(data) - if np.any(data <= floc): - raise FitDataError("gamma", lower=floc, upper=np.inf) - if floc != 0: - # Don't do the subtraction in-place, because `data` might be a - # view of the input array. 
- data = data - floc - xbar = data.mean() - - # Three cases to handle: - # * shape and scale both free - # * shape fixed, scale free - # * shape free, scale fixed - - if fscale is None: - # scale is free - if f0 is not None: - # shape is fixed - a = f0 - else: - # shape and scale are both free. - # The MLE for the shape parameter `a` is the solution to: - # log(a) - special.digamma(a) - log(xbar) + log(data.mean) = 0 - s = log(xbar) - log(data).mean() - func = lambda a: log(a) - special.digamma(a) - s - aest = (3-s + math.sqrt((s-3)**2 + 24*s)) / (12*s) - xa = aest*(1-0.4) - xb = aest*(1+0.4) - a = optimize.brentq(func, xa, xb, disp=0) - - # The MLE for the scale parameter is just the data mean - # divided by the shape parameter. - scale = xbar / a - else: - # scale is fixed, shape is free - # The MLE for the shape parameter `a` is the solution to: - # special.digamma(a) - log(data).mean() + log(fscale) = 0 - c = log(data).mean() - log(fscale) - a = _digammainv(c) - scale = fscale - - return a, floc, scale - -gamma = gamma_gen(a=0.0, name='gamma') - - -class erlang_gen(gamma_gen): - """An Erlang continuous random variable. - - %(before_notes)s - - See Also - -------- - gamma - - Notes - ----- - The Erlang distribution is a special case of the Gamma distribution, with - the shape parameter `a` an integer. Note that this restriction is not - enforced by `erlang`. It will, however, generate a warning the first time - a non-integer value is used for the shape parameter. - - Refer to `gamma` for examples. - - """ - - def _argcheck(self, a): - allint = np.all(np.floor(a) == a) - allpos = np.all(a > 0) - if not allint: - # An Erlang distribution shouldn't really have a non-integer - # shape parameter, so warn the user. - warnings.warn('The shape parameter of the erlang distribution ' - 'has been given a non-integer value %r.' % (a,), - RuntimeWarning) - return allpos - - def _fitstart(self, data): - # Override gamma_gen_fitstart so that an integer initial value is - # used. (Also regularize the division, to avoid issues when - # _skew(data) is 0 or close to 0.) - a = int(4.0 / (1e-8 + _skew(data)**2)) - return super(gamma_gen, self)._fitstart(data, args=(a,)) - - # Trivial override of the fit method, so we can monkey-patch its - # docstring. - def fit(self, data, *args, **kwds): - return super(erlang_gen, self).fit(data, *args, **kwds) - - if fit.__doc__ is not None: - fit.__doc__ = (rv_continuous.fit.__doc__ + - """ - Notes - ----- - The Erlang distribution is generally defined to have integer values - for the shape parameter. This is not enforced by the `erlang` class. - When fitting the distribution, it will generally return a non-integer - value for the shape parameter. By using the keyword argument - `f0=`, the fit method can be constrained to fit the data to - a specific integer shape parameter. - """) -erlang = erlang_gen(a=0.0, name='erlang') - - -class gengamma_gen(rv_continuous): - """A generalized gamma continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `gengamma` is:: - - gengamma.pdf(x, a, c) = abs(c) * x**(c*a-1) * exp(-x**c) / gamma(a) - - for ``x > 0``, ``a > 0``, and ``c != 0``. 
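As a quick cross-check of the density just given (a sketch against the upstream ``scipy.stats`` objects, not part of this module): with ``c == 1`` the generalized gamma collapses onto the ordinary gamma distribution.

>>> import numpy as np
>>> from scipy.stats import gengamma, gamma
>>> x = np.linspace(0.1, 5.0, 50)
>>> np.allclose(gengamma.pdf(x, 2.5, 1.0), gamma.pdf(x, 2.5))  # c == 1 gives gamma
True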
- - %(example)s - - """ - def _argcheck(self, a, c): - return (a > 0) & (c != 0) - - def _pdf(self, x, a, c): - return abs(c) * exp((c*a-1)*log(x)-x**c - gamln(a)) - - def _cdf(self, x, a, c): - val = special.gammainc(a,x**c) - cond = c + 0*val - return where(cond > 0,val,1-val) - - def _ppf(self, q, a, c): - val1 = special.gammaincinv(a,q) - val2 = special.gammaincinv(a,1.0-q) - ic = 1.0/c - cond = c+0*val1 - return where(cond > 0,val1**ic,val2**ic) - - def _munp(self, n, a, c): - return special.gamma(a+n*1.0/c) / special.gamma(a) - - def _entropy(self, a,c): - val = special.psi(a) - return a*(1-val) + 1.0/c*val + gamln(a)-log(abs(c)) -gengamma = gengamma_gen(a=0.0, name='gengamma') - - -class genhalflogistic_gen(rv_continuous): - """A generalized half-logistic continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `genhalflogistic` is:: - - genhalflogistic.pdf(x, c) = 2 * (1-c*x)**(1/c-1) / (1+(1-c*x)**(1/c))**2 - - for ``0 <= x <= 1/c``, and ``c > 0``. - - %(example)s - - """ - def _argcheck(self, c): - self.b = 1.0 / c - return (c > 0) - - def _pdf(self, x, c): - limit = 1.0/c - tmp = asarray(1-c*x) - tmp0 = tmp**(limit-1) - tmp2 = tmp0*tmp - return 2*tmp0 / (1+tmp2)**2 - - def _cdf(self, x, c): - limit = 1.0/c - tmp = asarray(1-c*x) - tmp2 = tmp**(limit) - return (1.0-tmp2) / (1+tmp2) - - def _ppf(self, q, c): - return 1.0/c*(1-((1.0-q)/(1.0+q))**c) - - def _entropy(self,c): - return 2 - (2*c+1)*log(2) -genhalflogistic = genhalflogistic_gen(a=0.0, name='genhalflogistic') - - -class gompertz_gen(rv_continuous): - """A Gompertz (or truncated Gumbel) continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `gompertz` is:: - - gompertz.pdf(x, c) = c * exp(x) * exp(-c*(exp(x)-1)) - - for ``x >= 0``, ``c > 0``. - - %(example)s - - """ - def _pdf(self, x, c): - exm1 = expm1(x) - return c*exp(x)*exp(-c*exm1) - - def _cdf(self, x, c): - return -expm1(-c*expm1(x)) - - def _ppf(self, q, c): - return log1p(-1.0/c*log1p(-q)) - - def _entropy(self, c): - return 1.0 - log(c) - exp(c)*special.expn(1,c) -gompertz = gompertz_gen(a=0.0, name='gompertz') - - -class gumbel_r_gen(rv_continuous): - """A right-skewed Gumbel continuous random variable. - - %(before_notes)s - - See Also - -------- - gumbel_l, gompertz, genextreme - - Notes - ----- - The probability density function for `gumbel_r` is:: - - gumbel_r.pdf(x) = exp(-(x + exp(-x))) - - The Gumbel distribution is sometimes referred to as a type I Fisher-Tippett - distribution. It is also related to the extreme value distribution, - log-Weibull and Gompertz distributions. - - %(example)s - - """ - def _pdf(self, x): - ex = exp(-x) - return ex*exp(-ex) - - def _logpdf(self, x): - return -x - exp(-x) - - def _cdf(self, x): - return exp(-exp(-x)) - - def _logcdf(self, x): - return -exp(-x) - - def _ppf(self, q): - return -log(-log(q)) - - def _stats(self): - return _EULER, pi*pi/6.0, \ - 12*sqrt(6)/pi**3 * _ZETA3, 12.0/5 - - def _entropy(self): - return 1.0608407169541684911 -gumbel_r = gumbel_r_gen(name='gumbel_r') - - -class gumbel_l_gen(rv_continuous): - """A left-skewed Gumbel continuous random variable. - - %(before_notes)s - - See Also - -------- - gumbel_r, gompertz, genextreme - - Notes - ----- - The probability density function for `gumbel_l` is:: - - gumbel_l.pdf(x) = exp(x - exp(x)) - - The Gumbel distribution is sometimes referred to as a type I Fisher-Tippett - distribution. 
It is also related to the extreme value distribution, - log-Weibull and Gompertz distributions. - - %(example)s - - """ - def _pdf(self, x): - ex = exp(x) - return ex*exp(-ex) - - def _logpdf(self, x): - return x - exp(x) - - def _cdf(self, x): - return -expm1(-exp(x)) - - def _ppf(self, q): - return log(-log1p(-q)) - - def _stats(self): - return -_EULER, pi*pi/6.0, \ - -12*sqrt(6)/pi**3 * _ZETA3, 12.0/5 - - def _entropy(self): - return 1.0608407169541684911 -gumbel_l = gumbel_l_gen(name='gumbel_l') - - -class halfcauchy_gen(rv_continuous): - """A Half-Cauchy continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `halfcauchy` is:: - - halfcauchy.pdf(x) = 2 / (pi * (1 + x**2)) - - for ``x >= 0``. - - %(example)s - - """ - def _pdf(self, x): - return 2.0/pi/(1.0+x*x) - - def _logpdf(self, x): - return np.log(2.0/pi) - np.log1p(x*x) - - def _cdf(self, x): - return 2.0/pi*arctan(x) - - def _ppf(self, q): - return tan(pi/2*q) - - def _stats(self): - return inf, inf, nan, nan - - def _entropy(self): - return log(2*pi) -halfcauchy = halfcauchy_gen(a=0.0, name='halfcauchy') - - -class halflogistic_gen(rv_continuous): - """A half-logistic continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `halflogistic` is:: - - halflogistic.pdf(x) = 2 * exp(-x) / (1+exp(-x))**2 = 1/2 * sech(x/2)**2 - - for ``x >= 0``. - - %(example)s - - """ - def _pdf(self, x): - return 0.5/(cosh(x/2.0))**2.0 - - def _cdf(self, x): - return tanh(x/2.0) - - def _ppf(self, q): - return 2*arctanh(q) - - def _munp(self, n): - if n == 1: - return 2*log(2) - if n == 2: - return pi*pi/3.0 - if n == 3: - return 9*_ZETA3 - if n == 4: - return 7*pi**4 / 15.0 - return 2*(1-pow(2.0,1-n))*special.gamma(n+1)*special.zeta(n,1) - - def _entropy(self): - return 2-log(2) -halflogistic = halflogistic_gen(a=0.0, name='halflogistic') - - -class halfnorm_gen(rv_continuous): - """A half-normal continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `halfnorm` is:: - - halfnorm.pdf(x) = sqrt(2/pi) * exp(-x**2/2) - - for ``x > 0``. - - `halfnorm` is a special case of `chi` with ``df == 1``. - - %(example)s - - """ - def _rvs(self): - return abs(norm.rvs(size=self._size)) - - def _pdf(self, x): - return sqrt(2.0/pi)*exp(-x*x/2.0) - - def _logpdf(self, x): - return 0.5 * np.log(2.0/pi) - x*x/2.0 - - def _cdf(self, x): - return special.ndtr(x)*2-1.0 - - def _ppf(self, q): - return special.ndtri((1+q)/2.0) - - def _stats(self): - return sqrt(2.0/pi), 1-2.0/pi, sqrt(2)*(4-pi)/(pi-2)**1.5, \ - 8*(pi-3)/(pi-2)**2 - - def _entropy(self): - return 0.5*log(pi/2.0)+0.5 -halfnorm = halfnorm_gen(a=0.0, name='halfnorm') - - -class hypsecant_gen(rv_continuous): - """A hyperbolic secant continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `hypsecant` is:: - - hypsecant.pdf(x) = 1/pi * sech(x) - - %(example)s - - """ - def _pdf(self, x): - return 1.0/(pi*cosh(x)) - - def _cdf(self, x): - return 2.0/pi*arctan(exp(x)) - - def _ppf(self, q): - return log(tan(pi*q/2.0)) - - def _stats(self): - return 0, pi*pi/4, 0, 2 - - def _entropy(self): - return log(2*pi) -hypsecant = hypsecant_gen(name='hypsecant') - - -class gausshyper_gen(rv_continuous): - """A Gauss hypergeometric continuous random variable. 
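A quick numerical confirmation of the ``halfnorm`` note above (``halfnorm`` is ``chi`` with ``df == 1``); this is only a sketch using the upstream ``scipy.stats`` instances.

>>> import numpy as np
>>> from scipy.stats import halfnorm, chi
>>> x = np.linspace(0.0, 4.0, 40)
>>> np.allclose(halfnorm.pdf(x), chi.pdf(x, 1))  # df == 1 reproduces the half-normal
True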
- - %(before_notes)s - - Notes - ----- - The probability density function for `gausshyper` is:: - - gausshyper.pdf(x, a, b, c, z) = - C * x**(a-1) * (1-x)**(b-1) * (1+z*x)**(-c) - - for ``0 <= x <= 1``, ``a > 0``, ``b > 0``, and - ``C = 1 / (B(a,b) F[2,1](c, a; a+b; -z))`` - - %(example)s - - """ - def _argcheck(self, a, b, c, z): - return (a > 0) & (b > 0) & (c == c) & (z == z) - - def _pdf(self, x, a, b, c, z): - Cinv = gam(a)*gam(b)/gam(a+b)*special.hyp2f1(c,a,a+b,-z) - return 1.0/Cinv * x**(a-1.0) * (1.0-x)**(b-1.0) / (1.0+z*x)**c - - def _munp(self, n, a, b, c, z): - fac = special.beta(n+a,b) / special.beta(a,b) - num = special.hyp2f1(c,a+n,a+b+n,-z) - den = special.hyp2f1(c,a,a+b,-z) - return fac*num / den -gausshyper = gausshyper_gen(a=0.0, b=1.0, name='gausshyper') - - -class invgamma_gen(rv_continuous): - """An inverted gamma continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `invgamma` is:: - - invgamma.pdf(x, a) = x**(-a-1) / gamma(a) * exp(-1/x) - - for x > 0, a > 0. - - `invgamma` is a special case of `gengamma` with ``c == -1``. - - %(example)s - - """ - def _pdf(self, x, a): - return exp(self._logpdf(x,a)) - - def _logpdf(self, x, a): - return (-(a+1)*log(x)-gamln(a) - 1.0/x) - - def _cdf(self, x, a): - return 1.0-special.gammainc(a, 1.0/x) - - def _ppf(self, q, a): - return 1.0/special.gammaincinv(a,1-q) - - def _munp(self, n, a): - return exp(gamln(a-n) - gamln(a)) - - def _entropy(self, a): - return a - (a+1.0)*special.psi(a) + gamln(a) -invgamma = invgamma_gen(a=0.0, name='invgamma') - - -# scale is gamma from DATAPLOT and B from Regress -class invgauss_gen(rv_continuous): - """An inverse Gaussian continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `invgauss` is:: - - invgauss.pdf(x, mu) = 1 / sqrt(2*pi*x**3) * exp(-(x-mu)**2/(2*x*mu**2)) - - for ``x > 0``. - - When `mu` is too small, evaluating the cumulative density function will be - inaccurate due to ``cdf(mu -> 0) = inf * 0``. - NaNs are returned for ``mu <= 0.0028``. - - %(example)s - - """ - def _rvs(self, mu): - return mtrand.wald(mu, 1.0, size=self._size) - - def _pdf(self, x, mu): - return 1.0/sqrt(2*pi*x**3.0)*exp(-1.0/(2*x)*((x-mu)/mu)**2) - - def _logpdf(self, x, mu): - return -0.5*log(2*pi) - 1.5*log(x) - ((x-mu)/mu)**2/(2*x) - - def _cdf(self, x, mu): - fac = sqrt(1.0/x) - # Numerical accuracy for small `mu` is bad. See #869. - C1 = norm.cdf(fac*(x-mu)/mu) - C1 += exp(1.0/mu) * norm.cdf(-fac*(x+mu)/mu) * exp(1.0/mu) - return C1 - - def _stats(self, mu): - return mu, mu**3.0, 3*sqrt(mu), 15*mu -invgauss = invgauss_gen(a=0.0, name='invgauss') - - -class invweibull_gen(rv_continuous): - """An inverted Weibull continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `invweibull` is:: - - invweibull.pdf(x, c) = c * x**(-c-1) * exp(-x**(-c)) - - for ``x > 0``, ``c > 0``. - - References - ---------- - F.R.S. de Gusmao, E.M.M Ortega and G.M. Cordeiro, "The generalized inverse - Weibull distribution", Stat. Papers, vol. 52, pp. 591-619, 2011. 
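Since the ``invweibull`` cdf is ``exp(-x**(-c))``, the quantile function has the closed form ``(-log(q))**(-1.0/c)``; a quick round-trip check (a sketch with the upstream ``scipy.stats`` instance, the shape value 3.0 is arbitrary):

>>> import numpy as np
>>> from scipy.stats import invweibull
>>> q = np.array([0.1, 0.5, 0.9])
>>> np.allclose(invweibull.cdf(invweibull.ppf(q, 3.0), 3.0), q)  # ppf inverts the cdf
True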
- - %(example)s - - """ - def _pdf(self, x, c): - xc1 = x**(-c-1.0) - xc2 = x**(-c) - xc2 = exp(-xc2) - return c*xc1*xc2 - - def _cdf(self, x, c): - xc1 = x**(-c) - return exp(-xc1) - - def _ppf(self, q, c): - return pow(-log(q),asarray(-1.0/c)) - - def _munp(self, n, c): - return special.gamma(1 - n / c) - - def _entropy(self, c): - return 1+_EULER + _EULER / c - log(c) -invweibull = invweibull_gen(a=0, name='invweibull') - - -class johnsonsb_gen(rv_continuous): - """A Johnson SB continuous random variable. - - %(before_notes)s - - See Also - -------- - johnsonsu - - Notes - ----- - The probability density function for `johnsonsb` is:: - - johnsonsb.pdf(x, a, b) = b / (x*(1-x)) * phi(a + b * log(x/(1-x))) - - for ``0 < x < 1`` and ``a,b > 0``, and ``phi`` is the normal pdf. - - %(example)s - - """ - def _argcheck(self, a, b): - return (b > 0) & (a == a) - - def _pdf(self, x, a, b): - trm = norm.pdf(a+b*log(x/(1.0-x))) - return b*1.0/(x*(1-x))*trm - - def _cdf(self, x, a, b): - return norm.cdf(a+b*log(x/(1.0-x))) - - def _ppf(self, q, a, b): - return 1.0/(1+exp(-1.0/b*(norm.ppf(q)-a))) -johnsonsb = johnsonsb_gen(a=0.0, b=1.0, name='johnsonb') - - -class johnsonsu_gen(rv_continuous): - """A Johnson SU continuous random variable. - - %(before_notes)s - - See Also - -------- - johnsonsb - - Notes - ----- - The probability density function for `johnsonsu` is:: - - johnsonsu.pdf(x, a, b) = b / sqrt(x**2 + 1) * - phi(a + b * log(x + sqrt(x**2 + 1))) - - for all ``x, a, b > 0``, and `phi` is the normal pdf. - - %(example)s - - """ - def _argcheck(self, a, b): - return (b > 0) & (a == a) - - def _pdf(self, x, a, b): - x2 = x*x - trm = norm.pdf(a+b*log(x+sqrt(x2+1))) - return b*1.0/sqrt(x2+1.0)*trm - - def _cdf(self, x, a, b): - return norm.cdf(a+b*log(x+sqrt(x*x+1))) - - def _ppf(self, q, a, b): - return sinh((norm.ppf(q)-a)/b) -johnsonsu = johnsonsu_gen(name='johnsonsu') - - -class laplace_gen(rv_continuous): - """A Laplace continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `laplace` is:: - - laplace.pdf(x) = 1/2 * exp(-abs(x)) - - %(example)s - - """ - def _rvs(self): - return mtrand.laplace(0, 1, size=self._size) - - def _pdf(self, x): - return 0.5*exp(-abs(x)) - - def _cdf(self, x): - return where(x > 0, 1.0-0.5*exp(-x), 0.5*exp(x)) - - def _ppf(self, q): - return where(q > 0.5, -log(2*(1-q)), log(2*q)) - - def _stats(self): - return 0, 2, 0, 3 - - def _entropy(self): - return log(2)+1 -laplace = laplace_gen(name='laplace') - - -class levy_gen(rv_continuous): - """A Levy continuous random variable. - - %(before_notes)s - - See Also - -------- - levy_stable, levy_l - - Notes - ----- - The probability density function for `levy` is:: - - levy.pdf(x) = 1 / (x * sqrt(2*pi*x)) * exp(-1/(2*x)) - - for ``x > 0``. - - This is the same as the Levy-stable distribution with a=1/2 and b=1. - - %(example)s - - """ - def _pdf(self, x): - return 1/sqrt(2*pi*x)/x*exp(-1/(2*x)) - - def _cdf(self, x): - return 2*(1-norm._cdf(1/sqrt(x))) - - def _ppf(self, q): - val = norm._ppf(1-q/2.0) - return 1.0/(val*val) - - def _stats(self): - return inf, inf, nan, nan -levy = levy_gen(a=0.0,name="levy") - - -class levy_l_gen(rv_continuous): - """A left-skewed Levy continuous random variable. - - %(before_notes)s - - See Also - -------- - levy, levy_stable - - Notes - ----- - The probability density function for `levy_l` is:: - - levy_l.pdf(x) = 1 / (abs(x) * sqrt(2*pi*abs(x))) * exp(-1/(2*abs(x))) - - for ``x < 0``. 
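In the same spirit as for `levy` above, the left-skewed case can be expressed through the normal cdf as ``2*norm.cdf(1/sqrt(-x)) - 1`` for ``x < 0`` (a sketch using the upstream ``scipy.stats`` instances):

>>> import numpy as np
>>> from scipy.stats import levy_l, norm
>>> x = np.array([-10.0, -1.0, -0.1])
>>> np.allclose(levy_l.cdf(x), 2 * norm.cdf(1 / np.sqrt(-x)) - 1)
True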
- - This is the same as the Levy-stable distribution with a=1/2 and b=-1. - - %(example)s - - """ - def _pdf(self, x): - ax = abs(x) - return 1/sqrt(2*pi*ax)/ax*exp(-1/(2*ax)) - - def _cdf(self, x): - ax = abs(x) - return 2*norm._cdf(1/sqrt(ax))-1 - - def _ppf(self, q): - val = norm._ppf((q+1.0)/2) - return -1.0/(val*val) - - def _stats(self): - return inf, inf, nan, nan -levy_l = levy_l_gen(b=0.0, name="levy_l") - - -class levy_stable_gen(rv_continuous): - """A Levy-stable continuous random variable. - - %(before_notes)s - - See Also - -------- - levy, levy_l - - Notes - ----- - Levy-stable distribution (only random variates available -- ignore other - docs) - - %(example)s - - """ - def _rvs(self, alpha, beta): - sz = self._size - TH = uniform.rvs(loc=-pi/2.0,scale=pi,size=sz) - W = expon.rvs(size=sz) - if alpha == 1: - return 2/pi*(pi/2+beta*TH)*tan(TH)-beta*log((pi/2*W*cos(TH))/(pi/2+beta*TH)) - - ialpha = 1.0/alpha - aTH = alpha*TH - if beta == 0: - return W/(cos(TH)/tan(aTH)+sin(TH))*((cos(aTH)+sin(aTH)*tan(TH))/W)**ialpha - - val0 = beta*tan(pi*alpha/2) - th0 = arctan(val0)/alpha - val3 = W/(cos(TH)/tan(alpha*(th0+TH))+sin(TH)) - res3 = val3*((cos(aTH)+sin(aTH)*tan(TH)-val0*(sin(aTH)-cos(aTH)*tan(TH)))/W)**ialpha - return res3 - - def _argcheck(self, alpha, beta): - if beta == -1: - self.b = 0.0 - elif beta == 1: - self.a = 0.0 - return (alpha > 0) & (alpha <= 2) & (beta <= 1) & (beta >= -1) - - def _pdf(self, x, alpha, beta): - raise NotImplementedError -levy_stable = levy_stable_gen(name='levy_stable') - - -class logistic_gen(rv_continuous): - """A logistic (or Sech-squared) continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `logistic` is:: - - logistic.pdf(x) = exp(-x) / (1+exp(-x))**2 - - `logistic` is a special case of `genlogistic` with ``c == 1``. - - %(example)s - - """ - def _rvs(self): - return mtrand.logistic(size=self._size) - - def _pdf(self, x): - ex = exp(-x) - return ex / (1+ex)**2.0 - - def _cdf(self, x): - return 1.0/(1+exp(-x)) - - def _ppf(self, q): - return -log(1.0/q-1) - - def _stats(self): - return 0, pi*pi/3.0, 0, 6.0/5.0 - - def _entropy(self): - return 1.0 -logistic = logistic_gen(name='logistic') - - -class loggamma_gen(rv_continuous): - """A log gamma continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `loggamma` is:: - - loggamma.pdf(x, c) = exp(c*x-exp(x)) / gamma(c) - - for all ``x, c > 0``. - - %(example)s - - """ - def _rvs(self, c): - return log(mtrand.gamma(c, size=self._size)) - - def _pdf(self, x, c): - return exp(c*x-exp(x)-gamln(c)) - - def _cdf(self, x, c): - return special.gammainc(c, exp(x)) - - def _ppf(self, q, c): - return log(special.gammaincinv(c,q)) - - def _stats(self, c): - # See, for example, "A Statistical Study of Log-Gamma Distribution", by - # Ping Shing Chan (thesis, McMaster University, 1993). - mean = special.digamma(c) - var = special.polygamma(1, c) - skewness = special.polygamma(2, c) / np.power(var, 1.5) - excess_kurtosis = special.polygamma(3, c) / (var*var) - return mean, var, skewness, excess_kurtosis - -loggamma = loggamma_gen(name='loggamma') - - -class loglaplace_gen(rv_continuous): - """A log-Laplace continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `loglaplace` is:: - - loglaplace.pdf(x, c) = c / 2 * x**(c-1), for 0 < x < 1 - = c / 2 * x**(-c-1), for x >= 1 - - for ``c > 0``. - - References - ---------- - T.J. Kozubowski and K. 
Podgorski, "A log-Laplace growth rate model", - The Mathematical Scientist, vol. 28, pp. 49-60, 2003. - - %(example)s - - """ - def _pdf(self, x, c): - cd2 = c/2.0 - c = where(x < 1, c, -c) - return cd2*x**(c-1) - - def _cdf(self, x, c): - return where(x < 1, 0.5*x**c, 1-0.5*x**(-c)) - - def _ppf(self, q, c): - return where(q < 0.5, (2.0*q)**(1.0/c), (2*(1.0-q))**(-1.0/c)) - - def _munp(self, n, c): - return c**2 / (c**2 - n**2) - - def _entropy(self, c): - return log(2.0/c) + 1.0 -loglaplace = loglaplace_gen(a=0.0, name='loglaplace') - - -def _lognorm_logpdf(x, s): - return -log(x)**2 / (2*s**2) + np.where(x == 0, 0, -log(s*x*sqrt(2*pi))) - - -class lognorm_gen(rv_continuous): - """A lognormal continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `lognorm` is:: - - lognorm.pdf(x, s) = 1 / (s*x*sqrt(2*pi)) * exp(-1/2*(log(x)/s)**2) - - for ``x > 0``, ``s > 0``. - - If ``log(x)`` is normally distributed with mean ``mu`` and variance ``sigma**2``, - then ``x`` is log-normally distributed with shape parameter sigma and scale - parameter ``exp(mu)``. - - %(example)s - - """ - def _rvs(self, s): - return exp(s * mtrand.standard_normal(self._size)) - - def _pdf(self, x, s): - return exp(self._logpdf(x, s)) - - def _logpdf(self, x, s): - return _lognorm_logpdf(x, s) - - def _cdf(self, x, s): - return _norm_cdf(log(x) / s) - - def _ppf(self, q, s): - return exp(s * _norm_ppf(q)) - - def _stats(self, s): - p = exp(s*s) - mu = sqrt(p) - mu2 = p*(p-1) - g1 = sqrt((p-1))*(2+p) - g2 = numpy.polyval([1,2,3,0,-6.0],p) - return mu, mu2, g1, g2 - - def _entropy(self, s): - return 0.5 * (1 + log(2*pi) + 2 * log(s)) - def _fitstart(self, data): - scale = data.std() - loc = data.min()-0.001 - logd = log(data-loc) - m = logd.mean() - s = sqrt((logd**2).mean() - m**2) - return s, loc, scale -lognorm = lognorm_gen(a=0.0, name='lognorm') - - -class gilbrat_gen(rv_continuous): - """A Gilbrat continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `gilbrat` is:: - - gilbrat.pdf(x) = 1/(x*sqrt(2*pi)) * exp(-1/2*(log(x))**2) - - `gilbrat` is a special case of `lognorm` with ``s = 1``. - - %(example)s - - """ - def _rvs(self): - return exp(mtrand.standard_normal(self._size)) - - def _pdf(self, x): - return exp(self._logpdf(x)) - - def _logpdf(self, x): - return _lognorm_logpdf(x, 1.0) - - def _cdf(self, x): - return _norm_cdf(log(x)) - - def _ppf(self, q): - return exp(_norm_ppf(q)) - - def _stats(self): - p = np.e - mu = sqrt(p) - mu2 = p * (p - 1) - g1 = sqrt((p - 1)) * (2 + p) - g2 = numpy.polyval([1, 2, 3, 0, -6.0], p) - return mu, mu2, g1, g2 - - def _entropy(self): - return 0.5 * log(2 * pi) + 0.5 - def _fitstart(self, data): - scale = data.std() - loc = data.min()-0.001 - return loc, scale -gilbrat = gilbrat_gen(a=0.0, name='gilbrat') - - -class maxwell_gen(rv_continuous): - """A Maxwell continuous random variable. - - %(before_notes)s - - Notes - ----- - A special case of a `chi` distribution, with ``df = 3``, ``loc = 0.0``, - and given ``scale = a``, where ``a`` is the parameter used in the - Mathworld description [1]_. - - The probability density function for `maxwell` is:: - - maxwell.pdf(x) = sqrt(2/pi)x**2 * exp(-x**2/2) - - for ``x > 0``. - - References - ---------- - .. 
[1] http://mathworld.wolfram.com/MaxwellDistribution.html - - %(example)s - """ - def _rvs(self): - return chi.rvs(3.0,size=self._size) - - def _pdf(self, x): - return sqrt(2.0/pi)*x*x*exp(-x*x/2.0) - - def _cdf(self, x): - return special.gammainc(1.5,x*x/2.0) - - def _ppf(self, q): - return sqrt(2*special.gammaincinv(1.5,q)) - - def _stats(self): - val = 3*pi-8 - return 2*sqrt(2.0/pi), 3-8/pi, sqrt(2)*(32-10*pi)/val**1.5, \ - (-12*pi*pi + 160*pi - 384) / val**2.0 - - def _entropy(self): - return _EULER + 0.5*log(2*pi)-0.5 -maxwell = maxwell_gen(a=0.0, name='maxwell') - - -class mielke_gen(rv_continuous): - """A Mielke's Beta-Kappa continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `mielke` is:: - - mielke.pdf(x, k, s) = k * x**(k-1) / (1+x**s)**(1+k/s) - - for ``x > 0``. - - %(example)s - - """ - def _pdf(self, x, k, s): - return k*x**(k-1.0) / (1.0+x**s)**(1.0+k*1.0/s) - - def _cdf(self, x, k, s): - return x**k / (1.0+x**s)**(k*1.0/s) - - def _ppf(self, q, k, s): - qsk = pow(q,s*1.0/k) - return pow(qsk/(1.0-qsk),1.0/s) -mielke = mielke_gen(a=0.0, name='mielke') - - -class nakagami_gen(rv_continuous): - """A Nakagami continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `nakagami` is:: - - nakagami.pdf(x, nu) = 2 * nu**nu / gamma(nu) * - x**(2*nu-1) * exp(-nu*x**2) - - for ``x > 0``, ``nu > 0``. - - %(example)s - - """ - def _pdf(self, x, nu): - return 2*nu**nu/gam(nu)*(x**(2*nu-1.0))*exp(-nu*x*x) - - def _cdf(self, x, nu): - return special.gammainc(nu,nu*x*x) - - def _ppf(self, q, nu): - return sqrt(1.0/nu*special.gammaincinv(nu,q)) - - def _stats(self, nu): - mu = gam(nu+0.5)/gam(nu)/sqrt(nu) - mu2 = 1.0-mu*mu - g1 = mu * (1 - 4*nu*mu2) / 2.0 / nu / np.power(mu2, 1.5) - g2 = -6*mu**4*nu + (8*nu-2)*mu**2-2*nu + 1 - g2 /= nu*mu2**2.0 - return mu, mu2, g1, g2 -nakagami = nakagami_gen(a=0.0, name="nakagami") - - -class ncx2_gen(rv_continuous): - """A non-central chi-squared continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `ncx2` is:: - - ncx2.pdf(x, df, nc) = exp(-(nc+df)/2) * 1/2 * (x/nc)**((df-2)/4) - * I[(df-2)/2](sqrt(nc*x)) - - for ``x > 0``. - - %(example)s - - """ - def _rvs(self, df, nc): - return mtrand.noncentral_chisquare(df,nc,self._size) - - def _logpdf(self, x, df, nc): - a = asarray(df/2.0) - fac = -nc/2.0 - x/2.0 + (a-1)*np.log(x) - a*np.log(2) - special.gammaln(a) - return fac + np.nan_to_num(np.log(special.hyp0f1(a, nc * x/4.0))) - - def _pdf(self, x, df, nc): - return np.exp(self._logpdf(x, df, nc)) - - def _cdf(self, x, df, nc): - return special.chndtr(x,df,nc) - - def _ppf(self, q, df, nc): - return special.chndtrix(q,df,nc) - - def _stats(self, df, nc): - val = df + 2.0*nc - return df + nc, 2*val, sqrt(8)*(val+nc)/val**1.5, \ - 12.0*(val+2*nc)/val**2.0 - def _fitstart(self, data): - m = data.mean() - v = data.var() - # Supply a starting guess with method of moments: - nc = (v/2-m)/2 - df = m-nc - return super(ncx2_gen, self)._fitstart(data, args=(df, nc)) -ncx2 = ncx2_gen(a=0.0, name='ncx2') - - -class ncf_gen(rv_continuous): - """A non-central F distribution continuous random variable. 
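For reference, the first two moments returned by ``ncx2._stats`` above are mean ``df + nc`` and variance ``2*(df + 2*nc)``, which is what the method-of-moments starting guess in ``_fitstart`` builds on; a quick check (a sketch with the upstream ``scipy.stats`` instance, parameter values arbitrary):

>>> import numpy as np
>>> from scipy.stats import ncx2
>>> df, nc = 4.0, 2.5
>>> m, v = ncx2.stats(df, nc, moments='mv')
>>> np.allclose([m, v], [df + nc, 2 * (df + 2 * nc)])
True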
- - %(before_notes)s - - Notes - ----- - The probability density function for `ncf` is:: - - ncf.pdf(x, df1, df2, nc) = exp(nc/2 + nc*df1*x/(2*(df1*x+df2))) - * df1**(df1/2) * df2**(df2/2) * x**(df1/2-1) - * (df2+df1*x)**(-(df1+df2)/2) - * gamma(df1/2)*gamma(1+df2/2) - * L^{v1/2-1}^{v2/2}(-nc*v1*x/(2*(v1*x+v2))) - / (B(v1/2, v2/2) * gamma((v1+v2)/2)) - - for ``df1, df2, nc > 0``. - - %(example)s - - """ - def _rvs(self, dfn, dfd, nc): - return mtrand.noncentral_f(dfn,dfd,nc,self._size) - - def _pdf_skip(self, x, dfn, dfd, nc): - n1,n2 = dfn, dfd - term = -nc/2+nc*n1*x/(2*(n2+n1*x)) + gamln(n1/2.)+gamln(1+n2/2.) - term -= gamln((n1+n2)/2.0) - Px = exp(term) - Px *= n1**(n1/2) * n2**(n2/2) * x**(n1/2-1) - Px *= (n2+n1*x)**(-(n1+n2)/2) - Px *= special.assoc_laguerre(-nc*n1*x/(2.0*(n2+n1*x)),n2/2,n1/2-1) - Px /= special.beta(n1/2,n2/2) - # this function does not have a return - # drop it for now, the generic function seems to work ok - - def _cdf(self, x, dfn, dfd, nc): - return special.ncfdtr(dfn,dfd,nc,x) - - def _ppf(self, q, dfn, dfd, nc): - return special.ncfdtri(dfn, dfd, nc, q) - - def _munp(self, n, dfn, dfd, nc): - val = (dfn * 1.0/dfd)**n - term = gamln(n+0.5*dfn) + gamln(0.5*dfd-n) - gamln(dfd*0.5) - val *= exp(-nc / 2.0+term) - val *= special.hyp1f1(n+0.5*dfn, 0.5*dfn, 0.5*nc) - return val - - def _stats(self, dfn, dfd, nc): - mu = where(dfd <= 2, inf, dfd / (dfd-2.0)*(1+nc*1.0/dfn)) - mu2 = where(dfd <= 4, inf, 2*(dfd*1.0/dfn)**2.0 * - ((dfn+nc/2.0)**2.0 + (dfn+nc)*(dfd-2.0)) / - ((dfd-2.0)**2.0 * (dfd-4.0))) - return mu, mu2, None, None -ncf = ncf_gen(a=0.0, name='ncf') - - -class t_gen(rv_continuous): - """A Student's T continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `t` is:: - - gamma((df+1)/2) - t.pdf(x, df) = --------------------------------------------------- - sqrt(pi*df) * gamma(df/2) * (1+x**2/df)**((df+1)/2) - - for ``df > 0``. - - %(example)s - - """ - def _rvs(self, df): - return mtrand.standard_t(df, size=self._size) - - def _pdf(self, x, df): - r = asarray(df*1.0) - Px = exp(gamln((r+1)/2)-gamln(r/2)) - Px /= sqrt(r*pi)*(1+(x**2)/r)**((r+1)/2) - return Px - - def _logpdf(self, x, df): - r = df*1.0 - lPx = gamln((r+1)/2)-gamln(r/2) - lPx -= 0.5*log(r*pi) + (r+1)/2*log1p((x**2)/r) - return lPx - - def _cdf(self, x, df): - return special.stdtr(df, x) - - def _sf(self, x, df): - return special.stdtr(df, -x) - - def _ppf(self, q, df): - return special.stdtrit(df, q) - - def _isf(self, q, df): - return -special.stdtrit(df, q) - - def _stats(self, df): - mu2 = where(df > 2, df / (df-2.0), inf) - g1 = where(df > 3, 0.0, nan) - g2 = where(df > 4, 6.0/(df-4.0), nan) - return 0, mu2, g1, g2 -t = t_gen(name='t') - - -class nct_gen(rv_continuous): - """A non-central Student's T continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `nct` is:: - - df**(df/2) * gamma(df+1) - nct.pdf(x, df, nc) = ---------------------------------------------------- - 2**df*exp(nc**2/2) * (df+x**2)**(df/2) * gamma(df/2) - - for ``df > 0``. - - %(example)s - - """ - def _argcheck(self, df, nc): - return (df > 0) & (nc == nc) - - def _rvs(self, df, nc): - return norm.rvs(loc=nc,size=self._size)*sqrt(df) / sqrt(chi2.rvs(df,size=self._size)) - - def _pdf(self, x, df, nc): - n = df*1.0 - nc = nc*1.0 - x2 = x*x - ncx2 = nc*nc*x2 - fac1 = n + x2 - trm1 = n/2.*log(n) + gamln(n+1) - trm1 -= n*log(2)+nc*nc/2.+(n/2.)*log(fac1)+gamln(n/2.) 
- Px = exp(trm1) - valF = ncx2 / (2*fac1) - trm1 = sqrt(2)*nc*x*special.hyp1f1(n/2+1,1.5,valF) - trm1 /= asarray(fac1*special.gamma((n+1)/2)) - trm2 = special.hyp1f1((n+1)/2,0.5,valF) - trm2 /= asarray(sqrt(fac1)*special.gamma(n/2+1)) - Px *= trm1+trm2 - return Px - - def _cdf(self, x, df, nc): - return special.nctdtr(df, nc, x) - - def _ppf(self, q, df, nc): - return special.nctdtrit(df, nc, q) - - def _stats(self, df, nc, moments='mv'): - mu, mu2, g1, g2 = None, None, None, None - val1 = gam((df-1.0)/2.0) - val2 = gam(df/2.0) - if 'm' in moments: - mu = nc*sqrt(df/2.0)*val1/val2 - if 'v' in moments: - var = (nc*nc+1.0)*df/(df-2.0) - var -= nc*nc*df * val1**2 / 2.0 / val2**2 - mu2 = var - if 's' in moments: - g1n = 2*nc*sqrt(df)*val1*((nc*nc*(2*df-7)-3)*val2**2 - - nc*nc*(df-2)*(df-3)*val1**2) - g1d = (df-3)*sqrt(2*df*(nc*nc+1)/(df-2) - - nc*nc*df*(val1/val2)**2) * val2 * \ - (nc*nc*(df-2)*val1**2 - - 2*(nc*nc+1)*val2**2) - g1 = g1n/g1d - if 'k' in moments: - g2n = 2*(-3*nc**4*(df-2)**2 * (df-3) * (df-4)*val1**4 + - 2**(6-2*df) * nc*nc*(df-2)*(df-4) * - (nc*nc*(2*df-7)-3)*pi*gam(df+1)**2 - - 4*(nc**4*(df-5)-6*nc*nc-3)*(df-3)*val2**4) - g2d = (df-3)*(df-4)*(nc*nc*(df-2)*val1**2 - - 2*(nc*nc+1)*val2)**2 - g2 = g2n / g2d - return mu, mu2, g1, g2 -nct = nct_gen(name="nct") - - -class pareto_gen(rv_continuous): - """A Pareto continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `pareto` is:: - - pareto.pdf(x, b) = b / x**(b+1) - - for ``x >= 1``, ``b > 0``. - - %(example)s - - """ - def _pdf(self, x, b): - return b * x**(-b-1) - - def _cdf(self, x, b): - return 1 - x**(-b) - - def _ppf(self, q, b): - return pow(1-q, -1.0/b) - - def _stats(self, b, moments='mv'): - mu, mu2, g1, g2 = None, None, None, None - if 'm' in moments: - mask = b > 1 - bt = extract(mask,b) - mu = valarray(shape(b),value=inf) - place(mu, mask, bt / (bt-1.0)) - if 'v' in moments: - mask = b > 2 - bt = extract(mask,b) - mu2 = valarray(shape(b), value=inf) - place(mu2, mask, bt / (bt-2.0) / (bt-1.0)**2) - if 's' in moments: - mask = b > 3 - bt = extract(mask,b) - g1 = valarray(shape(b), value=nan) - vals = 2 * (bt + 1.0) * sqrt(bt - 2.0) / ((bt - 3.0) * sqrt(bt)) - place(g1, mask, vals) - if 'k' in moments: - mask = b > 4 - bt = extract(mask,b) - g2 = valarray(shape(b), value=nan) - vals = 6.0*polyval([1.0,1.0,-6,-2],bt) / \ - polyval([1.0,-7.0,12.0,0.0],bt) - place(g2, mask, vals) - return mu, mu2, g1, g2 - - def _entropy(self, c): - return 1 + 1.0/c - log(c) -pareto = pareto_gen(a=1.0, name="pareto") - - -class lomax_gen(rv_continuous): - """A Lomax (Pareto of the second kind) continuous random variable. - - %(before_notes)s - - Notes - ----- - The Lomax distribution is a special case of the Pareto distribution, with - (loc=-1.0). - - The probability density function for `lomax` is:: - - lomax.pdf(x, c) = c / (1+x)**(c+1) - - for ``x >= 0``, ``c > 0``. 
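The Lomax/Pareto relationship stated above can be verified directly: shifting ``pareto`` by ``loc=-1`` reproduces the ``lomax`` density (a sketch with the upstream ``scipy.stats`` instances, shape 2.0 arbitrary).

>>> import numpy as np
>>> from scipy.stats import lomax, pareto
>>> x = np.linspace(0.0, 5.0, 20)
>>> np.allclose(lomax.pdf(x, 2.0), pareto.pdf(x, 2.0, loc=-1.0))
True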
- - %(example)s - - """ - def _pdf(self, x, c): - return c*1.0/(1.0+x)**(c+1.0) - - def _logpdf(self, x, c): - return log(c) - (c+1)*log1p(x) - - def _cdf(self, x, c): - return 1.0-1.0/(1.0+x)**c - - def _sf(self, x, c): - return 1.0/(1.0+x)**c - - def _logsf(self, x, c): - return -c*log1p(x) - - def _ppf(self, q, c): - return pow(1.0-q,-1.0/c)-1 - - def _stats(self, c): - mu, mu2, g1, g2 = pareto.stats(c, loc=-1.0, moments='mvsk') - return mu, mu2, g1, g2 - - def _entropy(self, c): - return 1+1.0/c-log(c) -lomax = lomax_gen(a=0.0, name="lomax") - - -class pearson3_gen(rv_continuous): - """A pearson type III continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `pearson3` is:: - - pearson3.pdf(x, skew) = abs(beta) / gamma(alpha) * - (beta * (x - zeta))**(alpha - 1) * exp(-beta*(x - zeta)) - - where:: - - beta = 2 / (skew * stddev) - alpha = (stddev * beta)**2 - zeta = loc - alpha / beta - - %(example)s - - References - ---------- - R.W. Vogel and D.E. McMartin, "Probability Plot Goodness-of-Fit and - Skewness Estimation Procedures for the Pearson Type 3 Distribution", Water - Resources Research, Vol.27, 3149-3158 (1991). - - L.R. Salvosa, "Tables of Pearson's Type III Function", Ann. Math. Statist., - Vol.1, 191-198 (1930). - - "Using Modern Computing Tools to Fit the Pearson Type III Distribution to - Aviation Loads Data", Office of Aviation Research (2003). - - """ - def _preprocess(self, x, skew): - # The real 'loc' and 'scale' are handled in the calling pdf(...). The - # local variables 'loc' and 'scale' within pearson3._pdf are set to - # the defaults just to keep them as part of the equations for - # documentation. - loc = 0.0 - scale = 1.0 - - # If skew is small, return _norm_pdf. The divide between pearson3 - # and norm was found by brute force and is approximately a skew of - # 0.000016. No one, I hope, would actually use a skew value even - # close to this small. - norm2pearson_transition = 0.000016 - - ans, x, skew = np.broadcast_arrays([1.0], x, skew) - ans = ans.copy() - - mask = np.absolute(skew) < norm2pearson_transition - invmask = ~mask - - beta = 2.0 / (skew[invmask] * scale) - alpha = (scale * beta)**2 - zeta = loc - alpha / beta - - transx = beta * (x[invmask] - zeta) - return ans, x, transx, skew, mask, invmask, beta, alpha, zeta - - def _argcheck(self, skew): - # The _argcheck function in rv_continuous only allows positive - # arguments. The skew argument for pearson3 can be zero (which I want - # to handle inside pearson3._pdf) or negative. So just return True - # for all skew args. 
- return np.ones(np.shape(skew), dtype=bool) - - def _stats(self, skew): - ans, x, transx, skew, mask, invmask, beta, alpha, zeta = self._preprocess([1], skew) - m = zeta + alpha / beta - v = alpha / (beta**2) - s = 2.0 / (alpha**0.5) * np.sign(beta) - k = 6.0 / alpha - return m, v, s, k - - def _pdf(self, x, skew): - # Do the calculation in _logpdf since helps to limit - # overflow/underflow problems - ans = exp(self._logpdf(x, skew)) - if ans.ndim == 0: - if np.isnan(ans): - return 0.0 - return ans - ans[np.isnan(ans)] = 0.0 - return ans - - def _logpdf(self, x, skew): - # PEARSON3 logpdf GAMMA logpdf - # np.log(abs(beta)) - # + (alpha - 1)*log(beta*(x - zeta)) + (a - 1)*log(x) - # - beta*(x - zeta) - x - # - gamln(alpha) - gamln(a) - ans, x, transx, skew, mask, invmask, beta, alpha, zeta = self._preprocess(x, skew) - - ans[mask] = np.log(_norm_pdf(x[mask])) - ans[invmask] = log(abs(beta)) + gamma._logpdf(transx, alpha) - return ans - - def _cdf(self, x, skew): - ans, x, transx, skew, mask, invmask, beta, alpha, zeta = self._preprocess(x, skew) - - ans[mask] = _norm_cdf(x[mask]) - ans[invmask] = gamma._cdf(transx, alpha) - return ans - - def _rvs(self, skew): - ans, x, transx, skew, mask, invmask, beta, alpha, zeta = self._preprocess([0], skew) - if mask[0]: - return mtrand.standard_normal(self._size) - ans = mtrand.standard_gamma(alpha, self._size)/beta + zeta - if ans.size == 1: - return ans[0] - return ans - - def _ppf(self, q, skew): - ans, q, transq, skew, mask, invmask, beta, alpha, zeta = self._preprocess(q, skew) - ans[mask] = _norm_ppf(q[mask]) - ans[invmask] = special.gammaincinv(alpha,q[invmask])/beta + zeta - return ans -pearson3 = pearson3_gen(name="pearson3") - - -class powerlaw_gen(rv_continuous): - """A power-function continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `powerlaw` is:: - - powerlaw.pdf(x, a) = a * x**(a-1) - - for ``0 <= x <= 1``, ``a > 0``. - - `powerlaw` is a special case of `beta` with ``d == 1``. - - %(example)s - - """ - def _pdf(self, x, a): - return a*x**(a-1.0) - - def _logpdf(self, x, a): - return log(a) + (a-1)*log(x) - - def _cdf(self, x, a): - return x**(a*1.0) - - def _logcdf(self, x, a): - return a*log(x) - - def _ppf(self, q, a): - return pow(q, 1.0/a) - - def _stats(self, a): - return (a / (a + 1.0), - a / (a + 2.0) / (a + 1.0) ** 2, - -2.0 * ((a - 1.0) / (a + 3.0)) * sqrt((a + 2.0) / a), - 6 * polyval([1, -1, -6, 2], a) / (a * (a + 3.0) * (a + 4))) - - def _entropy(self, a): - return 1 - 1.0/a - log(a) -powerlaw = powerlaw_gen(a=0.0, b=1.0, name="powerlaw") - - -class powerlognorm_gen(rv_continuous): - """A power log-normal continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `powerlognorm` is:: - - powerlognorm.pdf(x, c, s) = c / (x*s) * phi(log(x)/s) * - (Phi(-log(x)/s))**(c-1), - - where ``phi`` is the normal pdf, and ``Phi`` is the normal cdf, - and ``x > 0``, ``s, c > 0``. - - %(example)s - - """ - def _pdf(self, x, c, s): - return c/(x*s)*norm.pdf(log(x)/s)*pow(norm.cdf(-log(x)/s),c*1.0-1.0) - - def _cdf(self, x, c, s): - return 1.0 - pow(norm.cdf(-log(x)/s),c*1.0) - - def _ppf(self, q, c, s): - return exp(-s*norm.ppf(pow(1.0-q,1.0/c))) -powerlognorm = powerlognorm_gen(a=0.0, name="powerlognorm") - - -class powernorm_gen(rv_continuous): - """A power normal continuous random variable. 
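A quick confirmation of the ``powerlaw`` note above (``powerlaw`` is ``beta`` with its second shape parameter fixed to 1); a sketch using the upstream ``scipy.stats`` instances, the shape value 1.7 is arbitrary.

>>> import numpy as np
>>> from scipy.stats import powerlaw, beta
>>> x = np.linspace(0.01, 0.99, 25)
>>> np.allclose(powerlaw.pdf(x, 1.7), beta.pdf(x, 1.7, 1.0))
True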
- - %(before_notes)s - - Notes - ----- - The probability density function for `powernorm` is:: - - powernorm.pdf(x, c) = c * phi(x) * (Phi(-x))**(c-1) - - where ``phi`` is the normal pdf, and ``Phi`` is the normal cdf, - and ``x > 0``, ``c > 0``. - - %(example)s - - """ - def _pdf(self, x, c): - return c*_norm_pdf(x) * \ - (_norm_cdf(-x)**(c-1.0)) - - def _logpdf(self, x, c): - return log(c) + _norm_logpdf(x) + (c-1)*_norm_logcdf(-x) - - def _cdf(self, x, c): - return 1.0-_norm_cdf(-x)**(c*1.0) - - def _ppf(self, q, c): - return -norm.ppf(pow(1.0-q,1.0/c)) -powernorm = powernorm_gen(name='powernorm') - - -class rdist_gen(rv_continuous): - """An R-distributed continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `rdist` is:: - - rdist.pdf(x, c) = (1-x**2)**(c/2-1) / B(1/2, c/2) - - for ``-1 <= x <= 1``, ``c > 0``. - - %(example)s - - """ - def _pdf(self, x, c): - return np.power((1.0 - x**2), c / 2.0 - 1) / special.beta(0.5, c / 2.0) - - def _cdf(self, x, c): - term1 = x / special.beta(0.5, c / 2.0) - res = 0.5 + term1 * special.hyp2f1(0.5, 1 - c / 2.0, 1.5, x**2) - # There's an issue with hyp2f1, it returns nans near x = +-1, c > 100. - # Use the generic implementation in that case. See gh-1285 for - # background. - if any(np.isnan(res)): - return rv_continuous._cdf(self, x, c) - - return res - - def _munp(self, n, c): - return (1 - (n % 2)) * special.beta((n + 1.0) / 2, c / 2.0) -rdist = rdist_gen(a=-1.0, b=1.0, name="rdist") - - -class rayleigh_gen(rv_continuous): - """A Rayleigh continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `rayleigh` is:: - - rayleigh.pdf(r) = r * exp(-r**2/2) - - for ``x >= 0``. - - `rayleigh` is a special case of `chi` with ``df == 2``. - - %(example)s - - """ - def link(self, x, logSF, phat, ix): - rv_continuous.link.__doc__ - if ix == 1: - return x - phat[0] / sqrt(-2.0 * logSF) - else: - return x - phat[1] * sqrt(-2.0 * logSF) - def _rvs(self): - return chi.rvs(2, size=self._size) - - def _pdf(self, r): - return exp(self._logpdf(r)) - def _logpdf(self, r): - rr2 = r * r / 2.0 - return where(rr2==inf, - rr2 , log(r) - rr2) - def _cdf(self, r): - return - expm1(-r * r / 2.0) - def _sf(self, r): - return exp(-r * r / 2.0) - def _ppf(self, q): - return sqrt(-2 * log1p(-q)) - - def _stats(self): - val = 4 - pi - return np.sqrt(pi/2), val/2, 2*(pi-3)*sqrt(pi)/val**1.5, \ - 6*pi/val-16/val**2 - - def _entropy(self): - return _EULER/2.0 + 1 - 0.5*log(2) -rayleigh = rayleigh_gen(a=0.0, name="rayleigh") - - -class truncrayleigh_gen(rv_continuous): - """A truncated Rayleigh continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `truncrayleigh` is:: - - truncrayleigh.cdf(r) = 1 - exp(-((r+c)**2-c**2)/2) - - for ``x >= 0, c>=0``. 
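Inverting the truncated-Rayleigh cdf just given yields the quantile expression ``sqrt(c*c - 2*log1p(-q)) - c``; a minimal standalone numpy sketch (the value of ``c`` is arbitrary) plugging that expression back into the cdf:

>>> import numpy as np
>>> c, q = 1.5, np.array([0.05, 0.5, 0.95])
>>> r = np.sqrt(c * c - 2 * np.log1p(-q)) - c             # candidate quantiles
>>> np.allclose(-np.expm1(-((r + c)**2 - c * c) / 2), q)  # cdf(r) recovers q
True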
- - %(example)s - - """ - def _argcheck(self, c): - return (c>=0) - def link(self, x, logSF, phat, ix): - rv_continuous.link.__doc__ - c = phat[0] - if ix == 2: - return x - phat[1] / (sqrt(c*c - 2 * logSF) - c) - elif ix == 1: - return x - phat[2] * (sqrt(c*c - 2 * logSF) - c) - elif ix==0: - xn = (x - phat[1])/phat[2] - return - 2 * logSF / xn - xn / 2.0 - def _fitstart(self, data, args=None): - if args is None: - args = (0.0,)*self.numargs - return args + self.fit_loc_scale(data, *args) - def _pdf(self, r, c): - rc = r+c - return rc*exp(-(rc*rc-c*c)/2.0) - def _logpdf(self, r, c): - rc = r+c - return log(rc)-(rc*rc-c*c)/2.0 - def _cdf(self, r, c): - rc = r+c - return - expm1(-(rc*rc-c*c)/ 2.0) - def _logsf(self, r, c): - rc = r+c - return -(rc*rc-c*c)/ 2.0 - def _sf(self, r, c): - return exp(self._logsf(r, c)) - def _ppf(self, q, c): - return sqrt(c*c - 2 * log1p(-q)) - c - def _stats(self, c): - # TODO: correct this it is wrong! - val = 4-pi - return np.sqrt(pi/2), val/2, 2*(pi-3)*sqrt(pi)/val**1.5, \ - 6*pi/val-16/val**2 - def _entropy(self, c): - # TODO: correct this it is wrong! - return _EULER/2.0 + 1 - 0.5*log(2) -truncrayleigh = truncrayleigh_gen(a=0.0, name="truncrayleigh", shapes='c') - -# Reciprocal Distribution -class reciprocal_gen(rv_continuous): - """A reciprocal continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `reciprocal` is:: - - reciprocal.pdf(x, a, b) = 1 / (x*log(b/a)) - - for ``a <= x <= b``, ``a, b > 0``. - - %(example)s - - """ - def _argcheck(self, a, b): - self.a = a - self.b = b - self.d = log(b*1.0 / a) - return (a > 0) & (b > 0) & (b > a) - - def _pdf(self, x, a, b): - return 1.0 / (x * self.d) - - def _logpdf(self, x, a, b): - return -log(x) - log(self.d) - - def _cdf(self, x, a, b): - return (log(x)-log(a)) / self.d - - def _ppf(self, q, a, b): - return a*pow(b*1.0/a,q) - - def _munp(self, n, a, b): - return 1.0/self.d / n * (pow(b*1.0,n) - pow(a*1.0,n)) - - def _entropy(self,a,b): - return 0.5*log(a*b)+log(log(b/a)) -reciprocal = reciprocal_gen(name="reciprocal") - - -# FIXME: PPF does not work. -class rice_gen(rv_continuous): - """A Rice continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `rice` is:: - - rice.pdf(x, b) = x * exp(-(x**2+b**2)/2) * I[0](x*b) - - for ``x > 0``, ``b > 0``. - - %(example)s - - """ - def _pdf(self, x, b): - return x*exp(-(x*x+b*b)/2.0)*special.i0(x*b) - - def _logpdf(self, x, b): - return log(x) - (x*x + b*b)/2.0 + log(special.i0(x*b)) - - def _munp(self, n, b): - nd2 = n/2.0 - n1 = 1+nd2 - b2 = b*b/2.0 - return 2.0**(nd2)*exp(-b2)*special.gamma(n1) * \ - special.hyp1f1(n1,1,b2) -rice = rice_gen(a=0.0, name="rice") - - -# FIXME: PPF does not work. -class recipinvgauss_gen(rv_continuous): - """A reciprocal inverse Gaussian continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `recipinvgauss` is:: - - recipinvgauss.pdf(x, mu) = 1/sqrt(2*pi*x) * exp(-(1-mu*x)**2/(2*x*mu**2)) - - for ``x >= 0``. 
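The ``reciprocal`` cdf defined above has the simple closed form ``log(x/a) / log(b/a)``; a quick check (a sketch with the upstream ``scipy.stats`` instance, the bounds 1 and 8 are arbitrary):

>>> import numpy as np
>>> from scipy.stats import reciprocal
>>> a, b = 1.0, 8.0
>>> x = np.array([1.0, 2.0, 4.0, 8.0])
>>> np.allclose(reciprocal.cdf(x, a, b), np.log(x / a) / np.log(b / a))
True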
- - %(example)s - - """ - def _rvs(self, mu): - return 1.0/mtrand.wald(mu, 1.0, size=self._size) - - def _pdf(self, x, mu): - return 1.0/sqrt(2*pi*x)*exp(-(1-mu*x)**2.0 / (2*x*mu**2.0)) - - def _logpdf(self, x, mu): - return -(1-mu*x)**2.0 / (2*x*mu**2.0) - 0.5*log(2*pi*x) - - def _cdf(self, x, mu): - trm1 = 1.0/mu - x - trm2 = 1.0/mu + x - isqx = 1.0/sqrt(x) - return 1.0-_norm_cdf(isqx*trm1)-exp(2.0/mu)*_norm_cdf(-isqx*trm2) -recipinvgauss = recipinvgauss_gen(a=0.0, name='recipinvgauss') - - -class semicircular_gen(rv_continuous): - """A semicircular continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `semicircular` is:: - - semicircular.pdf(x) = 2/pi * sqrt(1-x**2) - - for ``-1 <= x <= 1``. - - %(example)s - - """ - def _pdf(self, x): - return 2.0/pi*sqrt(1-x*x) - - def _cdf(self, x): - return 0.5+1.0/pi*(x*sqrt(1-x*x) + arcsin(x)) - - def _stats(self): - return 0, 0.25, 0, -1.0 - - def _entropy(self): - return 0.64472988584940017414 -semicircular = semicircular_gen(a=-1.0, b=1.0, name="semicircular") - - -class triang_gen(rv_continuous): - """A triangular continuous random variable. - - %(before_notes)s - - Notes - ----- - The triangular distribution can be represented with an up-sloping line from - ``loc`` to ``(loc + c*scale)`` and then downsloping for ``(loc + c*scale)`` - to ``(loc+scale)``. - - The standard form is in the range [0, 1] with c the mode. - The location parameter shifts the start to `loc`. - The scale parameter changes the width from 1 to `scale`. - - %(example)s - - """ - def _rvs(self, c): - return mtrand.triangular(0, c, 1, self._size) - - def _argcheck(self, c): - return (c >= 0) & (c <= 1) - - def _pdf(self, x, c): - return where(x < c, 2*x/c, 2*(1-x)/(1-c)) - - def _cdf(self, x, c): - return where(x < c, x*x/c, (x*x-2*x+c)/(c-1)) - - def _ppf(self, q, c): - return where(q < c, sqrt(c*q), 1-sqrt((1-c)*(1-q))) - - def _stats(self, c): - return (c+1.0)/3.0, (1.0-c+c*c)/18, sqrt(2)*(2*c-1)*(c+1)*(c-2) / \ - (5 * np.power((1.0-c+c*c), 1.5)), -3.0/5.0 - - def _entropy(self,c): - return 0.5-log(2) -triang = triang_gen(a=0.0, b=1.0, name="triang") - - -class truncexpon_gen(rv_continuous): - """A truncated exponential continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `truncexpon` is:: - - truncexpon.pdf(x, b) = exp(-x) / (1-exp(-b)) - - for ``0 < x < b``. - - %(example)s - - """ - def _argcheck(self, b): - self.b = b - return (b > 0) - - def _pdf(self, x, b): - return exp(-x) / (-expm1(-b)) - - def _logpdf(self, x, b): - return - x - log(-expm1(-b)) - - def _cdf(self, x, b): - return expm1(-x) / expm1(-b) - - def _ppf(self, q, b): - return - log1p(q * expm1(-b)) - - def _munp(self, n, b): - # wrong answer with formula, same as in continuous.pdf - # return gam(n+1)-special.gammainc(1+n,b) - if n == 1: - return (1-(b+1)*exp(-b))/(-expm1(-b)) - elif n == 2: - return 2*(1-0.5*(b*b+2*b+2)*exp(-b))/(-expm1(-b)) - else: - # return generic for higher moments - # return rv_continuous._mom1_sc(self,n, b) - return self._mom1_sc(n, b) - - def _entropy(self, b): - eB = exp(b) - return log(eB-1)+(1+eB*(b-1.0))/(1.0-eB) -truncexpon = truncexpon_gen(a=0.0, name='truncexpon') - - -class truncnorm_gen(rv_continuous): - """A truncated normal continuous random variable. - - %(before_notes)s - - Notes - ----- - The standard form of this distribution is a standard normal truncated to - the range [a,b] --- notice that a and b are defined over the domain of the - standard normal. 
To convert clip values for a specific mean and standard - deviation, use:: - - a, b = (myclip_a - my_mean) / my_std, (myclip_b - my_mean) / my_std - - %(example)s - - """ - def _argcheck(self, a, b): - self.a = a - self.b = b - self._nb = _norm_cdf(b) - self._na = _norm_cdf(a) - self._sb = _norm_sf(b) - self._sa = _norm_sf(a) - if self.a > 0: - self._delta = -(self._sb - self._sa) - else: - self._delta = self._nb - self._na - self._logdelta = log(self._delta) - return (a != b) - - def _pdf(self, x, a, b): - return _norm_pdf(x) / self._delta - - def _logpdf(self, x, a, b): - return _norm_logpdf(x) - self._logdelta - - def _cdf(self, x, a, b): - return (_norm_cdf(x) - self._na) / self._delta - - def _ppf(self, q, a, b): - if self.a > 0: - return _norm_isf(q*self._sb + self._sa*(1.0-q)) - else: - return _norm_ppf(q*self._nb + self._na*(1.0-q)) - - def _stats(self, a, b): - nA, nB = self._na, self._nb - d = nB - nA - pA, pB = _norm_pdf(a), _norm_pdf(b) - mu = (pA - pB) / d # correction sign - mu2 = 1 + (a*pA - b*pB) / d - mu*mu - return mu, mu2, None, None -truncnorm = truncnorm_gen(name='truncnorm') - - -# FIXME: RVS does not work. -class tukeylambda_gen(rv_continuous): - """A Tukey-Lamdba continuous random variable. - - %(before_notes)s - - Notes - ----- - A flexible distribution, able to represent and interpolate between the - following distributions: - - - Cauchy (lam=-1) - - logistic (lam=0.0) - - approx Normal (lam=0.14) - - u-shape (lam = 0.5) - - uniform from -1 to 1 (lam = 1) - - %(example)s - - """ - def _argcheck(self, lam): - return np.ones(np.shape(lam), dtype=bool) - - def _pdf(self, x, lam): - Fx = asarray(special.tklmbda(x,lam)) - Px = Fx**(lam-1.0) + (asarray(1-Fx))**(lam-1.0) - Px = 1.0/asarray(Px) - return where((lam <= 0) | (abs(x) < 1.0/asarray(lam)), Px, 0.0) - - def _cdf(self, x, lam): - return special.tklmbda(x, lam) - - def _ppf(self, q, lam): - q = q*1.0 - vals1 = (q**lam - (1-q)**lam)/lam - vals2 = log(q/(1-q)) - return where((lam == 0) & (q == q), vals2, vals1) - - def _stats(self, lam): - return 0, _tlvar(lam), 0, _tlkurt(lam) - - def _entropy(self, lam): - def integ(p): - return log(pow(p,lam-1)+pow(1-p,lam-1)) - return integrate.quad(integ,0,1)[0] -tukeylambda = tukeylambda_gen(name='tukeylambda') - - -class uniform_gen(rv_continuous): - """A uniform continuous random variable. - - This distribution is constant between `loc` and ``loc + scale``. - - %(before_notes)s - - %(example)s - - """ - def _rvs(self): - return mtrand.uniform(0.0,1.0,self._size) - - def _pdf(self, x): - return 1.0*(x == x) - - def _cdf(self, x): - return x - - def _ppf(self, q): - return q - - def _stats(self): - return 0.5, 1.0/12, 0, -1.2 - - def _entropy(self): - return 0.0 -uniform = uniform_gen(a=0.0, b=1.0, name='uniform') - - -class vonmises_gen(rv_continuous): - """A Von Mises continuous random variable. - - %(before_notes)s - - Notes - ----- - If `x` is not in range or `loc` is not in range it assumes they are angles - and converts them to [-pi, pi] equivalents. - - The probability density function for `vonmises` is:: - - vonmises.pdf(x, kappa) = exp(kappa * cos(x)) / (2*pi*I[0](kappa)) - - for ``-pi <= x <= pi``, ``kappa > 0``. 
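The ``truncnorm`` clip conversion described above in action (a small sketch; ``my_mean``, ``my_std``, ``myclip_a`` and ``myclip_b`` are just illustrative names):

>>> import numpy as np
>>> from scipy.stats import truncnorm
>>> my_mean, my_std = 5.0, 2.0
>>> myclip_a, myclip_b = 3.0, 9.0   # truncation points on the original scale
>>> a, b = (myclip_a - my_mean) / my_std, (myclip_b - my_mean) / my_std
>>> rv = truncnorm(a, b, loc=my_mean, scale=my_std)
>>> np.allclose([rv.cdf(myclip_a), rv.cdf(myclip_b)], [0.0, 1.0])  # all mass inside the clips
True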
- - %(example)s - - """ - def _rvs(self, kappa): - return mtrand.vonmises(0.0, kappa, size=self._size) - - def _pdf(self, x, kappa): - return exp(kappa * cos(x)) / (2*pi*special.i0(kappa)) - - def _cdf(self, x, kappa): - return vonmises_cython.von_mises_cdf(kappa, x) - - def _stats_skip(self, kappa): - return 0, None, 0, None -vonmises = vonmises_gen(name='vonmises') - - -class wald_gen(invgauss_gen): - """A Wald continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `wald` is:: - - wald.pdf(x, a) = 1/sqrt(2*pi*x**3) * exp(-(x-1)**2/(2*x)) - - for ``x > 0``. - - `wald` is a special case of `invgauss` with ``mu == 1``. - - %(example)s - """ - def _rvs(self): - return mtrand.wald(1.0, 1.0, size=self._size) - - def _pdf(self, x): - return invgauss._pdf(x, 1.0) - - def _logpdf(self, x): - return invgauss._logpdf(x, 1.0) - - def _cdf(self, x): - return invgauss._cdf(x, 1.0) - - def _stats(self): - return 1.0, 1.0, 3.0, 15.0 -wald = wald_gen(a=0.0, name="wald") - - -class wrapcauchy_gen(rv_continuous): - """A wrapped Cauchy continuous random variable. - - %(before_notes)s - - Notes - ----- - The probability density function for `wrapcauchy` is:: - - wrapcauchy.pdf(x, c) = (1-c**2) / (2*pi*(1+c**2-2*c*cos(x))) - - for ``0 <= x <= 2*pi``, ``0 < c < 1``. - - %(example)s - - """ - def _argcheck(self, c): - return (c > 0) & (c < 1) - - def _pdf(self, x, c): - return (1.0-c*c)/(2*pi*(1+c*c-2*c*cos(x))) - - def _cdf(self, x, c): - output = 0.0*x - val = (1.0+c)/(1.0-c) - c1 = x < pi - c2 = 1-c1 - xp = extract(c1,x) - xn = extract(c2,x) - if (any(xn)): - valn = extract(c2, np.ones_like(x)*val) - xn = 2*pi - xn - yn = tan(xn/2.0) - on = 1.0-1.0/pi*arctan(valn*yn) - place(output, c2, on) - if (any(xp)): - valp = extract(c1, np.ones_like(x)*val) - yp = tan(xp/2.0) - op = 1.0/pi*arctan(valp*yp) - place(output, c1, op) - return output - - def _ppf(self, q, c): - val = (1.0-c)/(1.0+c) - rcq = 2*arctan(val*tan(pi*q)) - rcmq = 2*pi-2*arctan(val*tan(pi*(1-q))) - return where(q < 1.0/2, rcq, rcmq) - - def _entropy(self, c): - return log(2*pi*(1-c*c)) -wrapcauchy = wrapcauchy_gen(a=0.0, b=2*pi, name='wrapcauchy') - - -# DISCRETE DISTRIBUTIONS - -def entropy(pk, qk=None, base=None): - """Calculate the entropy of a distribution for given probability values. - - If only probabilities `pk` are given, the entropy is calculated as - ``S = -sum(pk * log(pk), axis=0)``. - - If `qk` is not None, then compute a relative entropy (also known as - Kullback-Leibler divergence or Kullback-Leibler distance) - ``S = sum(pk * log(pk / qk), axis=0)``. - - This routine will normalize `pk` and `qk` if they don't sum to 1. - - Parameters - ---------- - pk : sequence - Defines the (discrete) distribution. ``pk[i]`` is the (possibly - unnormalized) probability of event ``i``. - qk : sequence, optional - Sequence against which the relative entropy is computed. Should be in - the same format as `pk`. - base : float, optional - The logarithmic base to use, defaults to ``e`` (natural logarithm). - - Returns - ------- - S : float - The calculated entropy. - - """ - pk = asarray(pk) - pk = 1.0*pk / sum(pk, axis=0) - if qk is None: - vec = special.xlogy(pk, pk) - else: - qk = asarray(qk) - if len(qk) != len(pk): - raise ValueError("qk and pk must have same length.") - qk = 1.0*qk / sum(qk, axis=0) - # If qk is zero anywhere, then unless pk is zero at those places - # too, the relative entropy is infinite. 
- if any(take(pk, nonzero(qk == 0.0), axis=0) != 0.0, 0): - return inf - vec = -special.xlogy(pk, pk / qk) - S = -sum(vec, axis=0) - if base is not None: - S /= log(base) - return S - - -## Handlers for generic case where xk and pk are given - -def _drv_pmf(self, xk, *args): - try: - return self.P[xk] - except KeyError: - return 0.0 - - -def _drv_cdf(self, xk, *args): - indx = argmax((self.xk > xk),axis=-1)-1 - return self.F[self.xk[indx]] - - -def _drv_ppf(self, q, *args): - indx = argmax((self.qvals >= q),axis=-1) - return self.Finv[self.qvals[indx]] - - -def _drv_nonzero(self, k, *args): - return 1 - - -def _drv_moment(self, n, *args): - n = asarray(n) - return sum(self.xk**n[newaxis,...] * self.pk, axis=0) - - -def _drv_moment_gen(self, t, *args): - t = asarray(t) - return sum(exp(self.xk * t[newaxis,...]) * self.pk, axis=0) - - -def _drv2_moment(self, n, *args): - """Non-central moment of discrete distribution.""" - # many changes, originally not even a return - tot = 0.0 - diff = 1e100 - # pos = self.a - pos = max(0.0, 1.0*self.a) - count = 0 - # handle cases with infinite support - ulimit = max(1000, (min(self.b,1000) + max(self.a,-1000))/2.0) - llimit = min(-1000, (min(self.b,1000) + max(self.a,-1000))/2.0) - - while (pos <= self.b) and ((pos <= ulimit) or - (diff > self.moment_tol)): - diff = np.power(pos, n) * self.pmf(pos,*args) - # use pmf because _pmf does not check support in randint - # and there might be problems ? with correct self.a, self.b at this stage - tot += diff - pos += self.inc - count += 1 - - if self.a < 0: # handle case when self.a = -inf - diff = 1e100 - pos = -self.inc - while (pos >= self.a) and ((pos >= llimit) or - (diff > self.moment_tol)): - diff = np.power(pos, n) * self.pmf(pos,*args) - # using pmf instead of _pmf, see above - tot += diff - pos -= self.inc - count += 1 - return tot - - -def _drv2_ppfsingle(self, q, *args): # Use basic bisection algorithm - b = self.b - a = self.a - if isinf(b): # Be sure ending point is > q - b = int(max(100*q,10)) - while 1: - if b >= self.b: - qb = 1.0 - break - qb = self._cdf(b,*args) - if (qb < q): - b += 10 - else: - break - else: - qb = 1.0 - if isinf(a): # be sure starting point < q - a = int(min(-100*q,-10)) - while 1: - if a <= self.a: - qb = 0.0 - break - qa = self._cdf(a,*args) - if (qa > q): - a -= 10 - else: - break - else: - qa = self._cdf(a, *args) - - while 1: - if (qa == q): - return a - if (qb == q): - return b - if b <= a+1: - # testcase: return wrong number at lower index - # python -c "from scipy.stats import zipf;print zipf.ppf(0.01,2)" wrong - # python -c "from scipy.stats import zipf;print zipf.ppf([0.01,0.61,0.77,0.83],2)" - # python -c "from scipy.stats import logser;print logser.ppf([0.1,0.66, 0.86,0.93],0.6)" - if qa > q: - return a - else: - return b - c = int((a+b)/2.0) - qc = self._cdf(c, *args) - if (qc < q): - if a != c: - a = c - else: - raise RuntimeError('updating stopped, endless loop') - qa = qc - elif (qc > q): - if b != c: - b = c - else: - raise RuntimeError('updating stopped, endless loop') - qb = qc - else: - return c - - -def reverse_dict(dict): - newdict = {} - sorted_keys = list(dict.keys()) - sorted_keys.sort() - for key in sorted_keys[::-1]: - newdict[dict[key]] = key - return newdict - - -def make_dict(keys, values): - d = {} - for key, value in zip(keys, values): - d[key] = value - return d - - -# Must over-ride one of _pmf or _cdf or pass in -# x_k, p(x_k) lists in initialization - -class rv_discrete(rv_generic): - """ - A generic discrete random variable class meant 
for subclassing. - - `rv_discrete` is a base class to construct specific distribution classes - and instances from for discrete random variables. rv_discrete can be used - to construct an arbitrary distribution with defined by a list of support - points and the corresponding probabilities. - - Parameters - ---------- - a : float, optional - Lower bound of the support of the distribution, default: 0 - b : float, optional - Upper bound of the support of the distribution, default: plus infinity - moment_tol : float, optional - The tolerance for the generic calculation of moments - values : tuple of two array_like - (xk, pk) where xk are points (integers) with positive probability pk - with sum(pk) = 1 - inc : integer - increment for the support of the distribution, default: 1 - other values have not been tested - badvalue : object, optional - The value in (masked) arrays that indicates a value that should be - ignored. - name : str, optional - The name of the instance. This string is used to construct the default - example for distributions. - longname : str, optional - This string is used as part of the first line of the docstring returned - when a subclass has no docstring of its own. Note: `longname` exists - for backwards compatibility, do not use for new subclasses. - shapes : str, optional - The shape of the distribution. For example ``"m, n"`` for a - distribution that takes two integers as the first two arguments for all - its methods. - extradoc : str, optional - This string is used as the last part of the docstring returned when a - subclass has no docstring of its own. Note: `extradoc` exists for - backwards compatibility, do not use for new subclasses. - - Methods - ------- - generic.rvs(, loc=0, size=1) - random variates - - generic.pmf(x, , loc=0) - probability mass function - - logpmf(x, , loc=0) - log of the probability density function - - generic.cdf(x, , loc=0) - cumulative density function - - generic.logcdf(x, , loc=0) - log of the cumulative density function - - generic.sf(x, , loc=0) - survival function (1-cdf --- sometimes more accurate) - - generic.logsf(x, , loc=0, scale=1) - log of the survival function - - generic.ppf(q, , loc=0) - percent point function (inverse of cdf --- percentiles) - - generic.isf(q, , loc=0) - inverse survival function (inverse of sf) - - generic.moment(n, , loc=0) - non-central n-th moment of the distribution. May not work for array arguments. - - generic.stats(, loc=0, moments='mv') - mean('m', axis=0), variance('v'), skew('s'), and/or kurtosis('k') - - generic.entropy(, loc=0) - entropy of the RV - - generic.fit(data, , loc=0) - Parameter estimates for generic data - - generic.expect(func=None, args=(), loc=0, lb=None, ub=None, conditional=False) - Expected value of a function with respect to the distribution. - Additional kwd arguments passed to integrate.quad - - generic.median(, loc=0) - Median of the distribution. - - generic.mean(, loc=0) - Mean of the distribution. - - generic.std(, loc=0) - Standard deviation of the distribution. - - generic.var(, loc=0) - Variance of the distribution. - - generic.interval(alpha, , loc=0) - Interval that with `alpha` percent probability contains a random - realization of this distribution. 
- - generic(, loc=0) - calling a distribution instance returns a frozen distribution - - Notes - ----- - - You can construct an arbitrary discrete rv where ``P{X=xk} = pk`` - by passing to the rv_discrete initialization method (through the - values=keyword) a tuple of sequences (xk, pk) which describes only those - values of X (xk) that occur with nonzero probability (pk). - - To create a new discrete distribution, we would do the following:: - - class poisson_gen(rv_discrete): - #"Poisson distribution" - def _pmf(self, k, mu): - ... - - and create an instance:: - - poisson = poisson_gen(name="poisson", - longname='A Poisson') - - The docstring can be created from a template. - - Alternatively, the object may be called (as a function) to fix the shape - and location parameters returning a "frozen" discrete RV object:: - - myrv = generic(, loc=0) - - frozen RV object with the same methods but holding the given - shape and location fixed. - - A note on ``shapes``: subclasses need not specify them explicitly. In this - case, the `shapes` will be automatically deduced from the signatures of the - overridden methods. - If, for some reason, you prefer to avoid relying on introspection, you can - specify ``shapes`` explicitly as an argument to the instance constructor. - - - Examples - -------- - - Custom made discrete distribution: - - >>> import matplotlib.pyplot as plt - >>> from scipy import stats - >>> xk = np.arange(7) - >>> pk = (0.1, 0.2, 0.3, 0.1, 0.1, 0.1, 0.1) - >>> custm = stats.rv_discrete(name='custm', values=(xk, pk)) - >>> h = plt.plot(xk, custm.pmf(xk)) - - Random number generation: - - >>> R = custm.rvs(size=100) - - Display frozen pmf: - - >>> numargs = generic.numargs - >>> [ ] = ['Replace with resonable value', ]*numargs - >>> rv = generic() - >>> x = np.arange(0, np.min(rv.dist.b, 3)+1) - >>> h = plt.plot(x, rv.pmf(x)) - - Here, ``rv.dist.b`` is the right endpoint of the support of ``rv.dist``. 
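# --- Illustrative sketch (editorial, not part of the patch) ------------------
# Runnable version of the "custom made discrete distribution" example above;
# the template placeholder `generic` is replaced by the instance actually
# constructed (`custm`), using scipy.stats, which this module mirrors:
import numpy as np
from scipy import stats

xk = np.arange(7)
pk = (0.1, 0.2, 0.3, 0.1, 0.1, 0.1, 0.1)
custm = stats.rv_discrete(name='custm', values=(xk, pk))

print(custm.pmf(xk))        # returns the probabilities supplied in pk
print(custm.cdf(xk))        # running cumulative sums of pk
print(custm.rvs(size=5))    # random variates drawn from the (xk, pk) table
# ------------------------------------------------------------------------------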
- - Check accuracy of cdf and ppf: - - >>> prb = generic.cdf(x, ) - >>> h = plt.semilogy(np.abs(x-generic.ppf(prb, ))+1e-20) - - """ - - def __init__(self, a=0, b=inf, name=None, badvalue=None, - moment_tol=1e-8,values=None,inc=1,longname=None, - shapes=None, extradoc=None): - - super(rv_generic,self).__init__() - - if badvalue is None: - badvalue = nan - if name is None: - name = 'Distribution' - self.badvalue = badvalue - self.a = a - self.b = b - self.name = name - self.moment_tol = moment_tol - self.inc = inc - self._cdfvec = vectorize(self._cdfsingle, otypes='d') - self.return_integers = 1 - self.vecentropy = vectorize(self._entropy) - self.shapes = shapes - self.extradoc = extradoc - - if values is not None: - self.xk, self.pk = values - self.return_integers = 0 - indx = argsort(ravel(self.xk)) - self.xk = take(ravel(self.xk),indx, 0) - self.pk = take(ravel(self.pk),indx, 0) - self.a = self.xk[0] - self.b = self.xk[-1] - self.P = make_dict(self.xk, self.pk) - self.qvals = numpy.cumsum(self.pk,axis=0) - self.F = make_dict(self.xk, self.qvals) - self.Finv = reverse_dict(self.F) - self._ppf = instancemethod(vectorize(_drv_ppf, otypes='d'), - self, rv_discrete) - self._pmf = instancemethod(vectorize(_drv_pmf, otypes='d'), - self, rv_discrete) - self._cdf = instancemethod(vectorize(_drv_cdf, otypes='d'), - self, rv_discrete) - self._nonzero = instancemethod(_drv_nonzero, self, rv_discrete) - self.generic_moment = instancemethod(_drv_moment, - self, rv_discrete) - self.moment_gen = instancemethod(_drv_moment_gen, - self, rv_discrete) - self._construct_argparser(names_to_inspect=['_drv_pmf'], - locscale_in='loc=0', - locscale_out='loc, 1') # scale=1 for discrete RVs - else: - self._construct_argparser(names_to_inspect=['_pmf', '_cdf'], - locscale_in='loc=0', - locscale_out='loc, 1') # scale=1 for discrete RVs - - # nin correction needs to be after we know numargs - # correct nin for generic moment vectorization - self.vec_generic_moment = vectorize(_drv2_moment, otypes='d') - self.vec_generic_moment.nin = self.numargs + 2 - self.generic_moment = instancemethod(self.vec_generic_moment, - self, rv_discrete) - - # correct nin for ppf vectorization - _vppf = vectorize(_drv2_ppfsingle, otypes='d') - _vppf.nin = self.numargs + 2 # +1 is for self - self._vecppf = instancemethod(_vppf, - self, rv_discrete) - - # now that self.numargs is defined, we can adjust nin - self._cdfvec.nin = self.numargs + 1 - - # generate docstring for subclass instances - if longname is None: - if name[0] in ['aeiouAEIOU']: - hstr = "An " - else: - hstr = "A " - longname = hstr + name - - if sys.flags.optimize < 2: - # Skip adding docstrings if interpreter is run with -OO - if self.__doc__ is None: - self._construct_default_doc(longname=longname, extradoc=extradoc) - else: - self._construct_doc() - - #discrete RV do not have the scale parameter, remove it - self.__doc__ = self.__doc__.replace('\n scale : array_like, ' - 'optional\n scale parameter (default=1)', '') - - - def _construct_default_doc(self, longname=None, extradoc=None): - """Construct instance docstring from the rv_discrete template.""" - if extradoc is None: - extradoc = '' - if extradoc.startswith('\n\n'): - extradoc = extradoc[2:] - self.__doc__ = ''.join(['%s discrete random variable.' 
% longname, - '\n\n%(before_notes)s\n', docheaders['notes'], - extradoc, '\n%(example)s']) - self._construct_doc() - - def _construct_doc(self): - """Construct the instance docstring with string substitutions.""" - tempdict = docdict_discrete.copy() - tempdict['name'] = self.name or 'distname' - tempdict['shapes'] = self.shapes or '' - - if self.shapes is None: - # remove shapes from call parameters if there are none - for item in ['callparams', 'default', 'before_notes']: - tempdict[item] = tempdict[item].replace( - "\n%(shapes)s : array_like\n shape parameters", "") - for i in range(2): - if self.shapes is None: - # necessary because we use %(shapes)s in two forms (w w/o ", ") - self.__doc__ = self.__doc__.replace("%(shapes)s, ", "") - self.__doc__ = doccer.docformat(self.__doc__, tempdict) - - def _rvs(self, *args): - return self._ppf(mtrand.random_sample(self._size),*args) - - def _nonzero(self, k, *args): - return floor(k) == k - - def _argcheck(self, *args): - cond = 1 - for arg in args: - cond &= (arg > 0) - return cond - - def _pmf(self, k, *args): - return self._cdf(k,*args) - self._cdf(k-1,*args) - - def _logpmf(self, k, *args): - return log(self._pmf(k, *args)) - - def _cdfsingle(self, k, *args): - m = arange(int(self.a),k+1) - return sum(self._pmf(m,*args),axis=0) - - def _cdf(self, x, *args): - k = floor(x) - return self._cdfvec(k,*args) - - def _logcdf(self, x, *args): - return log(self._cdf(x, *args)) - - def _sf(self, x, *args): - return 1.0-self._cdf(x,*args) - - def _logsf(self, x, *args): - return log(self._sf(x, *args)) - - def _ppf(self, q, *args): - return self._vecppf(q, *args) - - def _isf(self, q, *args): - return self._ppf(1-q,*args) - - def _stats(self, *args): - return None, None, None, None - - def _munp(self, n, *args): - return self.generic_moment(n, *args) - - def rvs(self, *args, **kwargs): - """ - Random variates of given type. - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - size : int or tuple of ints, optional - Defining number of random variates (default=1). Note that `size` - has to be given as keyword, not as positional argument. - - Returns - ------- - rvs : ndarray or scalar - Random variates of given `size`. - - """ - kwargs['discrete'] = True - return super(rv_discrete, self).rvs(*args, **kwargs) - - def pmf(self, k,*args, **kwds): - """ - Probability mass function at k of the given RV. - - Parameters - ---------- - k : array_like - quantiles - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : array_like, optional - Location parameter (default=0). - - Returns - ------- - pmf : array_like - Probability mass function evaluated at k - - """ - args, loc, _ = self._parse_args(*args, **kwds) - k,loc = map(asarray,(k,loc)) - args = tuple(map(asarray,args)) - k = asarray((k-loc)) - cond0 = self._argcheck(*args) - cond1 = (k >= self.a) & (k <= self.b) & self._nonzero(k,*args) - cond = cond0 & cond1 - output = zeros(shape(cond),'d') - place(output,(1-cond0) + np.isnan(k),self.badvalue) - if any(cond): - goodargs = argsreduce(cond, *((k,)+args)) - place(output,cond,self._pmf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def logpmf(self, k,*args, **kwds): - """ - Log of the probability mass function at k of the given RV. 
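# --- Illustrative sketch (editorial, not part of the patch) ------------------
# The generic fall-backs defined above tie the discrete methods together:
#   pmf(k) = cdf(k) - cdf(k-1),  cdf(k) = sum(pmf(a..k)),  sf = 1 - cdf,
#   isf(q) = ppf(1 - q).
# A quick numerical check of these identities using scipy.stats.poisson:
import numpy as np
from scipy import stats

mu = 3.0
k = np.arange(0, 10)
assert np.allclose(stats.poisson.pmf(k, mu),
                   stats.poisson.cdf(k, mu) - stats.poisson.cdf(k - 1, mu))
assert np.allclose(stats.poisson.sf(k, mu), 1.0 - stats.poisson.cdf(k, mu))
assert stats.poisson.isf(0.25, mu) == stats.poisson.ppf(0.75, mu)
# ------------------------------------------------------------------------------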
- - Parameters - ---------- - k : array_like - Quantiles. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter. Default is 0. - - Returns - ------- - logpmf : array_like - Log of the probability mass function evaluated at k. - - """ - args, loc, _ = self._parse_args(*args, **kwds) - k,loc = map(asarray,(k,loc)) - args = tuple(map(asarray,args)) - k = asarray((k-loc)) - cond0 = self._argcheck(*args) - cond1 = (k >= self.a) & (k <= self.b) & self._nonzero(k,*args) - cond = cond0 & cond1 - output = empty(shape(cond),'d') - output.fill(NINF) - place(output,(1-cond0) + np.isnan(k),self.badvalue) - if any(cond): - goodargs = argsreduce(cond, *((k,)+args)) - place(output,cond,self._logpmf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def cdf(self, k, *args, **kwds): - """ - Cumulative distribution function of the given RV. - - Parameters - ---------- - k : array_like, int - Quantiles. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - - Returns - ------- - cdf : ndarray - Cumulative distribution function evaluated at `k`. - - """ - args, loc, _ = self._parse_args(*args, **kwds) - k,loc = map(asarray,(k,loc)) - args = tuple(map(asarray,args)) - k = asarray((k-loc)) - cond0 = self._argcheck(*args) - cond1 = (k >= self.a) & (k < self.b) - cond2 = (k >= self.b) - cond = cond0 & cond1 - output = zeros(shape(cond),'d') - place(output,(1-cond0) + np.isnan(k),self.badvalue) - place(output,cond2*(cond0 == cond0), 1.0) - - if any(cond): - goodargs = argsreduce(cond, *((k,)+args)) - place(output,cond,self._cdf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def logcdf(self, k, *args, **kwds): - """ - Log of the cumulative distribution function at k of the given RV - - Parameters - ---------- - k : array_like, int - Quantiles. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - - Returns - ------- - logcdf : array_like - Log of the cumulative distribution function evaluated at k. - - """ - args, loc, _ = self._parse_args(*args, **kwds) - k,loc = map(asarray,(k,loc)) - args = tuple(map(asarray,args)) - k = asarray((k-loc)) - cond0 = self._argcheck(*args) - cond1 = (k >= self.a) & (k < self.b) - cond2 = (k >= self.b) - cond = cond0 & cond1 - output = empty(shape(cond),'d') - output.fill(NINF) - place(output,(1-cond0) + np.isnan(k),self.badvalue) - place(output,cond2*(cond0 == cond0), 0.0) - - if any(cond): - goodargs = argsreduce(cond, *((k,)+args)) - place(output,cond,self._logcdf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def sf(self,k,*args,**kwds): - """ - Survival function (1-cdf) at k of the given RV. - - Parameters - ---------- - k : array_like - Quantiles. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - - Returns - ------- - sf : array_like - Survival function evaluated at k. 
- - """ - args, loc, _ = self._parse_args(*args, **kwds) - k,loc = map(asarray,(k,loc)) - args = tuple(map(asarray,args)) - k = asarray(k-loc) - cond0 = self._argcheck(*args) - cond1 = (k >= self.a) & (k <= self.b) - cond2 = (k < self.a) & cond0 - cond = cond0 & cond1 - output = zeros(shape(cond),'d') - place(output,(1-cond0) + np.isnan(k),self.badvalue) - place(output,cond2,1.0) - if any(cond): - goodargs = argsreduce(cond, *((k,)+args)) - place(output,cond,self._sf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def logsf(self,k,*args,**kwds): - """ - Log of the survival function of the given RV. - - Returns the log of the "survival function," defined as ``1 - cdf``, - evaluated at `k`. - - Parameters - ---------- - k : array_like - Quantiles. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - - Returns - ------- - sf : ndarray - Survival function evaluated at `k`. - - """ - args, loc, _ = self._parse_args(*args, **kwds) - k,loc = map(asarray,(k,loc)) - args = tuple(map(asarray,args)) - k = asarray(k-loc) - cond0 = self._argcheck(*args) - cond1 = (k >= self.a) & (k <= self.b) - cond2 = (k < self.a) & cond0 - cond = cond0 & cond1 - output = empty(shape(cond),'d') - output.fill(NINF) - place(output,(1-cond0) + np.isnan(k),self.badvalue) - place(output,cond2,0.0) - if any(cond): - goodargs = argsreduce(cond, *((k,)+args)) - place(output,cond,self._logsf(*goodargs)) - if output.ndim == 0: - return output[()] - return output - - def ppf(self,q,*args,**kwds): - """ - Percent point function (inverse of cdf) at q of the given RV - - Parameters - ---------- - q : array_like - Lower tail probability. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - scale : array_like, optional - Scale parameter (default=1). - - Returns - ------- - k : array_like - Quantile corresponding to the lower tail probability, q. - - """ - args, loc, _ = self._parse_args(*args, **kwds) - q,loc = map(asarray,(q,loc)) - args = tuple(map(asarray,args)) - cond0 = self._argcheck(*args) & (loc == loc) - cond1 = (q > 0) & (q < 1) - cond2 = (q == 1) & cond0 - cond = cond0 & cond1 - output = valarray(shape(cond),value=self.badvalue,typecode='d') - # output type 'd' to handle nin and inf - place(output,(q == 0)*(cond == cond), self.a-1) - place(output,cond2,self.b) - if any(cond): - goodargs = argsreduce(cond, *((q,)+args+(loc,))) - loc, goodargs = goodargs[-1], goodargs[:-1] - place(output,cond,self._ppf(*goodargs) + loc) - - if output.ndim == 0: - return output[()] - return output - - def isf(self,q,*args,**kwds): - """ - Inverse survival function (1-sf) at q of the given RV. - - Parameters - ---------- - q : array_like - Upper tail probability. - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - - Returns - ------- - k : ndarray or scalar - Quantile corresponding to the upper tail probability, q. 
- - """ - args, loc, _ = self._parse_args(*args, **kwds) - q,loc = map(asarray,(q,loc)) - args = tuple(map(asarray,args)) - cond0 = self._argcheck(*args) & (loc == loc) - cond1 = (q > 0) & (q < 1) - cond2 = (q == 1) & cond0 - cond = cond0 & cond1 - - # same problem as with ppf; copied from ppf and changed - output = valarray(shape(cond),value=self.badvalue,typecode='d') - # output type 'd' to handle nin and inf - place(output,(q == 0)*(cond == cond), self.b) - place(output,cond2,self.a-1) - - # call place only if at least 1 valid argument - if any(cond): - goodargs = argsreduce(cond, *((q,)+args+(loc,))) - loc, goodargs = goodargs[-1], goodargs[:-1] - place(output,cond,self._isf(*goodargs) + loc) # PB same as ticket 766 - - if output.ndim == 0: - return output[()] - return output - - def stats(self, *args, **kwds): - """ - Some statistics of the given discrete RV. - - Parameters - ---------- - arg1, arg2, arg3,... : array_like - The shape parameter(s) for the distribution (see docstring of the - instance object for more information). - loc : array_like, optional - Location parameter (default=0). - moments : string, optional - Composed of letters ['mvsk'] defining which moments to compute: - - - 'm' = mean, - - 'v' = variance, - - 's' = (Fisher's) skew, - - 'k' = (Fisher's) kurtosis. - - The default is'mv'. - - Returns - ------- - stats : sequence - of requested moments. - - """ - try: - kwds["moments"] = kwds.pop("moment") # test suite is full of these; a feature? - except KeyError: - pass - args, loc, _, moments = self._parse_args_stats(*args, **kwds) - loc = asarray(loc) - args = tuple(map(asarray,args)) - cond = self._argcheck(*args) & (loc == loc) - - signature = inspect.getargspec(get_method_function(self._stats)) - if (signature[2] is not None) or ('moments' in signature[0]): - mu, mu2, g1, g2 = self._stats(*args,**{'moments':moments}) - else: - mu, mu2, g1, g2 = self._stats(*args) - if g1 is None: - mu3 = None - else: - mu3 = g1 * np.power(mu2, 1.5) - default = valarray(shape(cond), self.badvalue) - output = [] - - # Use only entries that are valid in calculation - goodargs = argsreduce(cond, *(args+(loc,))) - loc, goodargs = goodargs[-1], goodargs[:-1] - - if 'm' in moments: - if mu is None: - mu = self._munp(1.0,*goodargs) - out0 = default.copy() - place(out0,cond,mu+loc) - output.append(out0) - - if 'v' in moments: - if mu2 is None: - mu2p = self._munp(2.0,*goodargs) - if mu is None: - mu = self._munp(1.0,*goodargs) - mu2 = mu2p - mu*mu - out0 = default.copy() - place(out0,cond,mu2) - output.append(out0) - - if 's' in moments: - if g1 is None: - mu3p = self._munp(3.0,*goodargs) - if mu is None: - mu = self._munp(1.0,*goodargs) - if mu2 is None: - mu2p = self._munp(2.0,*goodargs) - mu2 = mu2p - mu*mu - mu3 = mu3p - 3*mu*mu2 - mu**3 - g1 = mu3 / np.power(mu2, 1.5) - out0 = default.copy() - place(out0,cond,g1) - output.append(out0) - - if 'k' in moments: - if g2 is None: - mu4p = self._munp(4.0,*goodargs) - if mu is None: - mu = self._munp(1.0,*goodargs) - if mu2 is None: - mu2p = self._munp(2.0,*goodargs) - mu2 = mu2p - mu*mu - if mu3 is None: - mu3p = self._munp(3.0,*goodargs) - mu3 = mu3p - 3*mu*mu2 - mu**3 - mu4 = mu4p - 4*mu*mu3 - 6*mu*mu*mu2 - mu**4 - g2 = mu4 / mu2**2.0 - 3.0 - out0 = default.copy() - place(out0,cond,g2) - output.append(out0) - - if len(output) == 1: - return output[0] - else: - return tuple(output) - - def moment(self, n, *args, **kwds): - """ - n'th non-central moment of the distribution - - Parameters - ---------- - n : int, n>=1 - order of moment - arg1, 
arg2, arg3,... : float - The shape parameter(s) for the distribution (see docstring of the - instance object for more information) - loc : float, optional - location parameter (default=0) - scale : float, optional - scale parameter (default=1) - - """ - loc = kwds.get('loc', 0) - scale = kwds.get('scale', 1) - if not (self._argcheck(*args) and (scale > 0)): - return nan - if (floor(n) != n): - raise ValueError("Moment must be an integer.") - if (n < 0): - raise ValueError("Moment must be positive.") - mu, mu2, g1, g2 = None, None, None, None - if (n > 0) and (n < 5): - signature = inspect.getargspec(get_method_function(self._stats)) - if (signature[2] is not None) or ('moments' in signature[0]): - dict = {'moments':{1:'m',2:'v',3:'vs',4:'vk'}[n]} - else: - dict = {} - mu, mu2, g1, g2 = self._stats(*args,**dict) - val = _moment_from_stats(n, mu, mu2, g1, g2, self._munp, args) - - # Convert to transformed X = L + S*Y - # so E[X^n] = E[(L+S*Y)^n] = L^n sum(comb(n,k)*(S/L)^k E[Y^k],k=0...n) - if loc == 0: - return scale**n * val - else: - result = 0 - fac = float(scale) / float(loc) - for k in range(n): - valk = _moment_from_stats(k, mu, mu2, g1, g2, self._munp, args) - result += comb(n,k,exact=True)*(fac**k) * valk - result += fac**n * val - return result * loc**n - - def freeze(self, *args, **kwds): - return rv_frozen(self, *args, **kwds) - - def _entropy(self, *args): - if hasattr(self,'pk'): - return entropy(self.pk) - else: - mu = int(self.stats(*args, **{'moments':'m'})) - val = self.pmf(mu,*args) - ent = -special.xlogy(val, val) - k = 1 - term = 1.0 - while (abs(term) > eps): - val = self.pmf(mu+k,*args) - term = -special.xlogy(val, val) - val = self.pmf(mu-k,*args) - term -= special.xlogy(val, val) - k += 1 - ent += term - return ent - - def entropy(self, *args, **kwds): - args, loc, _ = self._parse_args(*args, **kwds) - loc = asarray(loc) - args = list(map(asarray,args)) - cond0 = self._argcheck(*args) & (loc == loc) - output = zeros(shape(cond0),'d') - place(output,(1-cond0),self.badvalue) - goodargs = argsreduce(cond0, *args) - # np.vectorize doesn't work when numargs == 0 in numpy 1.5.1 - if self.numargs == 0: - place(output, cond0, self._entropy()) - else: - place(output, cond0, self.vecentropy(*goodargs)) - - return output - - def __call__(self, *args, **kwds): - return self.freeze(*args,**kwds) - - def expect(self, func=None, args=(), loc=0, lb=None, ub=None, conditional=False): - """ - Calculate expected value of a function with respect to the distribution - for discrete distribution - - Parameters - ---------- - fn : function (default: identity mapping) - Function for which sum is calculated. Takes only one argument. - args : tuple - argument (parameters) of the distribution - lb, ub : numbers, optional - lower and upper bound for integration, default is set to the support - of the distribution, lb and ub are inclusive (ul<=k<=ub) - conditional : bool, optional - Default is False. - If true then the expectation is corrected by the conditional - probability of the integration interval. The return value is the - expectation of the function, conditional on being in the given - interval (k such that ul<=k<=ub). - - Returns - ------- - expect : float - Expected value. - - Notes - ----- - * function is not vectorized - * accuracy: uses self.moment_tol as stopping criterium - for heavy tailed distribution e.g. 
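# --- Illustrative sketch (editorial, not part of the patch) ------------------
# moment() above maps moments of the standardized variable Y to X = loc + scale*Y
# through the binomial expansion
#   E[X**n] = sum_{k=0..n} comb(n, k) * loc**(n-k) * scale**k * E[Y**k].
# A quick check against scipy.stats for the normal distribution, using the
# known raw moments of the standard normal (E[Y]=0, E[Y**2]=1, E[Y**3]=0):
from scipy.special import comb
from scipy import stats

loc, scale, n = 2.0, 3.0, 3
EY = {0: 1.0, 1: 0.0, 2: 1.0, 3: 0.0}
expanded = sum(comb(n, k, exact=True) * loc**(n - k) * scale**k * EY[k]
               for k in range(n + 1))
direct = stats.norm.moment(n, loc=loc, scale=scale)
assert abs(expanded - direct) < 1e-8          # both give E[(loc + scale*Z)**3] = 62
# ------------------------------------------------------------------------------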
zipf(4), accuracy for - mean, variance in example is only 1e-5, - increasing precision (moment_tol) makes zipf very slow - * suppnmin=100 internal parameter for minimum number of points to evaluate - could be added as keyword parameter, to evaluate functions with - non-monotonic shapes, points include integers in (-suppnmin, suppnmin) - * uses maxcount=1000 limits the number of points that are evaluated - to break loop for infinite sums - (a maximum of suppnmin+1000 positive plus suppnmin+1000 negative - integers are evaluated) - - """ - - # moment_tol = 1e-12 # increase compared to self.moment_tol, - # too slow for only small gain in precision for zipf - - # avoid endless loop with unbound integral, eg. var of zipf(2) - maxcount = 1000 - suppnmin = 100 # minimum number of points to evaluate (+ and -) - - if func is None: - def fun(x): - # loc and args from outer scope - return (x+loc)*self._pmf(x, *args) - else: - def fun(x): - # loc and args from outer scope - return func(x+loc)*self._pmf(x, *args) - # used pmf because _pmf does not check support in randint - # and there might be problems(?) with correct self.a, self.b at this stage - # maybe not anymore, seems to work now with _pmf - - self._argcheck(*args) # (re)generate scalar self.a and self.b - if lb is None: - lb = (self.a) - else: - lb = lb - loc # convert bound for standardized distribution - if ub is None: - ub = (self.b) - else: - ub = ub - loc # convert bound for standardized distribution - if conditional: - if np.isposinf(ub)[()]: - # work around bug: stats.poisson.sf(stats.poisson.b, 2) is nan - invfac = 1 - self.cdf(lb-1,*args) - else: - invfac = 1 - self.cdf(lb-1,*args) - self.sf(ub,*args) - else: - invfac = 1.0 - - tot = 0.0 - low, upp = self._ppf(0.001, *args), self._ppf(0.999, *args) - low = max(min(-suppnmin, low), lb) - upp = min(max(suppnmin, upp), ub) - supp = np.arange(low, upp+1, self.inc) # check limits - # print 'low, upp', low, upp - tot = np.sum(fun(supp)) - diff = 1e100 - pos = upp + self.inc - count = 0 - - # handle cases with infinite support - - while (pos <= ub) and (diff > self.moment_tol) and count <= maxcount: - diff = fun(pos) - tot += diff - pos += self.inc - count += 1 - - if self.a < 0: # handle case when self.a = -inf - diff = 1e100 - pos = low - self.inc - while (pos >= lb) and (diff > self.moment_tol) and count <= maxcount: - diff = fun(pos) - tot += diff - pos -= self.inc - count += 1 - if count > maxcount: - warnings.warn('expect(): sum did not converge', RuntimeWarning) - return tot/invfac - - -class binom_gen(rv_discrete): - """A binomial discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `binom` is:: - - binom.pmf(k) = choose(n,k) * p**k * (1-p)**(n-k) - - for ``k`` in ``{0,1,...,n}``. - - `binom` takes ``n`` and ``p`` as shape parameters. - - %(example)s - - """ - def _rvs(self, n, p): - return mtrand.binomial(n,p,self._size) - - def _argcheck(self, n, p): - self.b = n - return (n >= 0) & (p >= 0) & (p <= 1) - - def _logpmf(self, x, n, p): - """ Return logPMF - - Reference - -------------- - Catherine Loader (2000). - "Fast and Accurate Computation of Binomial Probabilities"; - url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.35.2719" } - """ - logp = where((p==0) & (x==0), 1, log(p)) - log1mp = where((p==1) & (x==n), 1, log1p(-p)) - - PI2 = 2.0 * pi - - yborder = log((x == 0.) * exp(n * log1mp) + (x == n) * exp(n * logp)) - nx = n - x - nq = n * (1. 
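# --- Illustrative sketch (editorial, not part of the patch) ------------------
# expect() above approximates E[f(X)] for a discrete RV by summing f(k)*pmf(k)
# over the support and stopping once terms fall below moment_tol or a maximum
# count is reached. A bare-bones version of that idea for a distribution whose
# support is 0, 1, 2, ...; `discrete_expect_sketch` is a hypothetical helper:
from scipy import stats

def discrete_expect_sketch(func, pmf, tol=1e-10, maxcount=1000):
    total, k = 0.0, 0
    while k <= maxcount:
        term = func(k) * pmf(k)
        total += term
        if k > 0 and abs(term) < tol:      # tail has become negligible
            break
        k += 1
    return total

# E[X] for Poisson(mu=3) recovered by direct summation:
# discrete_expect_sketch(lambda k: k, lambda k: stats.poisson.pmf(k, 3.0))  # ~3.0
# ------------------------------------------------------------------------------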
- p) - lc = stirlerr(n) - stirlerr(x) - stirlerr(nx) - bd0(x, n * p) - bd0(nx, nq) - inside = (0. < p) & (p < 1.) & (0. < x) & (x < n) - xnx = where((x == 0) | (x == n), 1.0, x*nx) # avoid division by zero - return where(inside, lc + 0.5 * log(n / (PI2 * xnx)), yborder) - def _pmf(self, x, n, p): - return exp(self._logpmf(x, n, p)) - - def _cdf(self, x, n, p): - k = floor(x) - vals = special.bdtr(k,n,p) - return vals - - def _sf(self, x, n, p): - k = floor(x) - return special.bdtrc(k,n,p) - - def _ppf(self, q, n, p): - vals = ceil(special.bdtrik(q,n,p)) - vals1 = vals-1 - temp = special.bdtr(vals1,n,p) - return where(temp >= q, vals1, vals) - - def _stats(self, n, p): - q = 1.0-p - mu = n * p - var = n * p * q - g1 = (q-p) / sqrt(n*p*q) - g2 = (1.0-6*p*q)/(n*p*q) - return mu, var, g1, g2 - - def _entropy(self, n, p): - k = r_[0:n + 1] - vals = self._pmf(k, n, p) - h = -sum(special.xlogy(vals, vals), axis=0) - return h -binom = binom_gen(name='binom') - - -class bernoulli_gen(binom_gen): - """A Bernoulli discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `bernoulli` is:: - - bernoulli.pmf(k) = 1-p if k = 0 - = p if k = 1 - - for ``k`` in ``{0,1}``. - - `bernoulli` takes ``p`` as shape parameter. - - %(example)s - - """ - def _rvs(self, p): - return binom_gen._rvs(self, 1, p) - - def _argcheck(self, p): - return (p >= 0) & (p <= 1) - - def _logpmf(self, x, p): - return binom._logpmf(x, 1, p) - - def _pmf(self, x, p): - return binom._pmf(x, 1, p) - - def _cdf(self, x, p): - return binom._cdf(x, 1, p) - - def _sf(self, x, p): - return binom._sf(x, 1, p) - - def _ppf(self, q, p): - return binom._ppf(q, 1, p) - - def _stats(self, p): - return binom._stats(1, p) - - def _entropy(self, p): - h = -special.xlogy(p, p) - special.xlogy(1 - p, 1 - p) - return h -bernoulli = bernoulli_gen(b=1,name='bernoulli') - - -class nbinom_gen(rv_discrete): - """A negative binomial discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `nbinom` is:: - - nbinom.pmf(k) = choose(k+n-1, n-1) * p**n * (1-p)**k - - for ``k >= 0``. - - `nbinom` takes ``n`` and ``p`` as shape parameters. - - %(example)s - - """ - def _rvs(self, n, p): - return mtrand.negative_binomial(n, p, self._size) - - def _argcheck(self, n, p): - return (n >= 0) & (p >= 0) & (p <= 1) - - def _pmf(self, x, n, p): - return exp(self._logpmf(x, n, p)) - - def _logpmf(self, x, n, p): - coeff = gamln(n+x) - gamln(x+1) - gamln(n) - return coeff + n*log(p) + x*log1p(-p) - - def _cdf(self, x, n, p): - k = floor(x) - return special.betainc(n, k+1, p) - - def _sf_skip(self, x, n, p): - # skip because special.nbdtrc doesn't work for 0= q, vals1, vals) - - def _stats(self, n, p): - Q = 1.0 / p - P = Q - 1.0 - mu = n*P - var = n*P*Q - g1 = (Q+P)/sqrt(n*P*Q) - g2 = (1.0 + 6*P*Q) / (n*P*Q) - return mu, var, g1, g2 -nbinom = nbinom_gen(name='nbinom') - - -class geom_gen(rv_discrete): - """A geometric discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `geom` is:: - - geom.pmf(k) = (1-p)**(k-1)*p - - for ``k >= 1``. - - `geom` takes ``p`` as shape parameter. 
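# --- Illustrative sketch (editorial, not part of the patch) ------------------
# bernoulli_gen above simply delegates every method to binom with n fixed at 1,
# i.e. a Bernoulli variable is a single binomial trial; the geometric pmf/cdf
# pair quoted in the geom docstring can be checked the same way:
import numpy as np
from scipy import stats

p = 0.3
k = np.array([0, 1])
assert np.allclose(stats.bernoulli.pmf(k, p), stats.binom.pmf(k, 1, p))
assert np.allclose(stats.bernoulli.cdf(k, p), stats.binom.cdf(k, 1, p))

kk = np.arange(1, 8)
assert np.allclose(stats.geom.pmf(kk, p), (1 - p)**(kk - 1) * p)
assert np.allclose(stats.geom.cdf(kk, p), 1 - (1 - p)**kk)
# ------------------------------------------------------------------------------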
- - %(example)s - - """ - def _rvs(self, p): - return mtrand.geometric(p, size=self._size) - - def _argcheck(self, p): - return (p <= 1) & (p >= 0) - - def _pmf(self, k, p): - return (1-p)**(k-1) * p - - def _logpmf(self, k, p): - return (k-1)*log1p(-p) + log(p) - - def _cdf(self, x, p): - k = floor(x) - return (1.0-(1.0-p)**k) - - def _sf(self, x, p): - k = floor(x) - return (1.0-p)**k - - def _ppf(self, q, p): - vals = ceil(log1p(-q)/log1p(-p)) - temp = 1.0-(1.0-p)**(vals-1) - return where((temp >= q) & (vals > 0), vals-1, vals) - - def _stats(self, p): - mu = 1.0/p - qr = 1.0-p - var = qr / p / p - g1 = (2.0-p) / sqrt(qr) - g2 = numpy.polyval([1,-6,6],p)/(1.0-p) - return mu, var, g1, g2 -geom = geom_gen(a=1,name='geom', longname="A geometric") - - -class hypergeom_gen(rv_discrete): - """A hypergeometric discrete random variable. - - The hypergeometric distribution models drawing objects from a bin. - M is the total number of objects, n is total number of Type I objects. - The random variate represents the number of Type I objects in N drawn - without replacement from the total population. - - %(before_notes)s - - Notes - ----- - The probability mass function is defined as:: - - pmf(k, M, n, N) = choose(n, k) * choose(M - n, N - k) / choose(M, N), - for N - (M-n) <= k <= min(m,N) - - Examples - -------- - >>> from scipy.stats import hypergeom - - Suppose we have a collection of 20 animals, of which 7 are dogs. Then if - we want to know the probability of finding a given number of dogs if we - choose at random 12 of the 20 animals, we can initialize a frozen - distribution and plot the probability mass function: - - >>> [M, n, N] = [20, 7, 12] - >>> rv = hypergeom(M, n, N) - >>> x = np.arange(0, n+1) - >>> pmf_dogs = rv.pmf(x) - - >>> fig = plt.figure() - >>> ax = fig.add_subplot(111) - >>> ax.plot(x, pmf_dogs, 'bo') - >>> ax.vlines(x, 0, pmf_dogs, lw=2) - >>> ax.set_xlabel('# of dogs in our group of chosen animals') - >>> ax.set_ylabel('hypergeom PMF') - >>> plt.show() - - Instead of using a frozen distribution we can also use `hypergeom` - methods directly. 
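# --- Illustrative sketch (editorial, not part of the patch) ------------------
# The hypergeometric pmf quoted above can be checked directly against binomial
# coefficients; the implementation itself works in log-space (gammaln) for
# numerical precision, but for modest M the two agree:
import numpy as np
from scipy import stats
from scipy.special import comb

M, n, N = 20, 7, 12                    # population, type-I objects, draws
k = np.arange(0, n + 1)
pmf_direct = comb(n, k) * comb(M - n, N - k) / comb(M, N)
assert np.allclose(stats.hypergeom.pmf(k, M, n, N), pmf_direct)
assert np.isclose(pmf_direct.sum(), 1.0)
# ------------------------------------------------------------------------------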
To for example obtain the cumulative distribution - function, use: - - >>> prb = hypergeom.cdf(x, M, n, N) - - And to generate random numbers: - - >>> R = hypergeom.rvs(M, n, N, size=10) - - """ - def _rvs(self, M, n, N): - return mtrand.hypergeometric(n,M-n,N,size=self._size) - - def _argcheck(self, M, n, N): - cond = rv_discrete._argcheck(self,M,n,N) - cond &= (n <= M) & (N <= M) - self.a = max(N-(M-n), 0) - self.b = min(n,N) - return cond - - def _logpmf(self, k, M, n, N): - tot, good = M, n - bad = tot - good - return gamln(good+1) - gamln(good-k+1) - gamln(k+1) + gamln(bad+1) \ - - gamln(bad-N+k+1) - gamln(N-k+1) - gamln(tot+1) + gamln(tot-N+1) \ - + gamln(N+1) - - def _pmf(self, k, M, n, N): - # same as the following but numerically more precise - # return comb(good,k) * comb(bad,N-k) / comb(tot,N) - return exp(self._logpmf(k, M, n, N)) - - def _stats(self, M, n, N): - tot, good = M, n - n = good*1.0 - m = (tot-good)*1.0 - N = N*1.0 - tot = m+n - p = n/tot - mu = N*p - var = m*n*N*(tot-N)*1.0/(tot*tot*(tot-1)) - g1 = (m - n)*(tot-2*N) / (tot-2.0)*sqrt((tot-1.0)/(m*n*N*(tot-N))) - m2, m3, m4, m5 = m**2, m**3, m**4, m**5 - n2, n3, n4, n5 = n**2, n**2, n**4, n**5 - g2 = m3 - m5 + n*(3*m2-6*m3+m4) + 3*m*n2 - 12*m2*n2 + 8*m3*n2 + n3 \ - - 6*m*n3 + 8*m2*n3 + m*n4 - n5 - 6*m3*N + 6*m4*N + 18*m2*n*N \ - - 6*m3*n*N + 18*m*n2*N - 24*m2*n2*N - 6*n3*N - 6*m*n3*N \ - + 6*n4*N + N*N*(6*m2 - 6*m3 - 24*m*n + 12*m2*n + 6*n2 + - 12*m*n2 - 6*n3) - return mu, var, g1, g2 - - def _entropy(self, M, n, N): - k = r_[N - (M - n):min(n, N) + 1] - vals = self.pmf(k, M, n, N) - h = -sum(special.xlogy(vals, vals), axis=0) - return h - - def _sf(self, k, M, n, N): - """More precise calculation, 1 - cdf doesn't cut it.""" - # This for loop is needed because `k` can be an array. If that's the - # case, the sf() method makes M, n and N arrays of the same shape. We - # therefore unpack all inputs args, so we can do the manual integration. - res = [] - for quant, tot, good, draw in zip(k, M, n, N): - # Manual integration over probability mass function. More accurate - # than integrate.quad. - k2 = np.arange(quant + 1, draw + 1) - res.append(np.sum(self._pmf(k2, tot, good, draw))) - return np.asarray(res) -hypergeom = hypergeom_gen(name='hypergeom') - - -# FIXME: Fails _cdfvec -class logser_gen(rv_discrete): - """A Logarithmic (Log-Series, Series) discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `logser` is:: - - logser.pmf(k) = - p**k / (k*log(1-p)) - - for ``k >= 1``. - - `logser` takes ``p`` as shape parameter. - - %(example)s - - """ - def _rvs(self, p): - # looks wrong for p>0.5, too few k=1 - # trying to use generic is worse, no k=1 at all - return mtrand.logseries(p, size=self._size) - - def _argcheck(self, p): - return (p > 0) & (p < 1) - - def _pmf(self, k, p): - return -p**k * 1.0 / k / log(1 - p) - - def _stats(self, p): - r = log1p(-p) - mu = p / (p - 1.0) / r - mu2p = -p / r / (p - 1.0)**2 - var = mu2p - mu*mu - mu3p = -p / r * (1.0+p) / (1.0 - p)**3 - mu3 = mu3p - 3*mu*mu2p + 2*mu**3 - g1 = mu3 / var**1.5 - - mu4p = -p / r * (1.0 / (p-1)**2 - 6*p / (p - 1)**3 + - 6*p*p / (p-1)**4) - mu4 = mu4p - 4*mu3p*mu + 6*mu2p*mu*mu - 3*mu**4 - g2 = mu4 / var**2 - 3.0 - return mu, var, g1, g2 -logser = logser_gen(a=1,name='logser', longname='A logarithmic') - - -class poisson_gen(rv_discrete): - """A Poisson discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `poisson` is:: - - poisson.pmf(k) = exp(-mu) * mu**k / k! 
- - for ``k >= 0``. - - `poisson` takes ``mu`` as shape parameter. - - %(example)s - - """ - def _rvs(self, mu): - return mtrand.poisson(mu, self._size) - - def _logpmf(self, k, mu): - Pk = k*log(mu)-gamln(k+1) - mu - return Pk - - def _pmf(self, k, mu): - return exp(self._logpmf(k, mu)) - - def _cdf(self, x, mu): - k = floor(x) - return special.pdtr(k,mu) - - def _sf(self, x, mu): - k = floor(x) - return special.pdtrc(k,mu) - - def _ppf(self, q, mu): - vals = ceil(special.pdtrik(q,mu)) - vals1 = vals-1 - temp = special.pdtr(vals1,mu) - return where((temp >= q), vals1, vals) - - def _stats(self, mu): - var = mu - tmp = asarray(mu) - g1 = 1.0 / tmp - g2 = 1.0 / tmp - return mu, var, g1, g2 -poisson = poisson_gen(name="poisson", longname='A Poisson') - - -class planck_gen(rv_discrete): - """A Planck discrete exponential random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `planck` is:: - - planck.pmf(k) = (1-exp(-lambda_))*exp(-lambda_*k) - - for ``k*lambda_ >= 0``. - - `planck` takes ``lambda_`` as shape parameter. - - %(example)s - - """ - def _argcheck(self, lambda_): - if (lambda_ > 0): - self.a = 0 - self.b = inf - return 1 - elif (lambda_ < 0): - self.a = -inf - self.b = 0 - return 1 - else: - return 0 - - def _pmf(self, k, lambda_): - fact = -expm1(-lambda_) - return fact * exp(-lambda_ * k) - - def _cdf(self, x, lambda_): - k = floor(x) - return - expm1(-lambda_ * (k + 1)) - - def _ppf(self, q, lambda_): - vals = ceil(-1.0/lambda_ * log1p(-q)-1) - vals1 = (vals-1).clip(self.a, np.inf) - temp = self._cdf(vals1, lambda_) - return where(temp >= q, vals1, vals) - - def _stats(self, lambda_): - mu = 1/(exp(lambda_)-1) - var = exp(-lambda_)/(expm1(-lambda_))**2 - g1 = 2*cosh(lambda_/2.0) - g2 = 4+2*cosh(lambda_) - return mu, var, g1, g2 - - def _entropy(self, lambda_): - l = lambda_ - C = -expm1(-l) - return l * exp(-l) / C - log(C) -planck = planck_gen(name='planck',longname='A discrete exponential ') - - -class boltzmann_gen(rv_discrete): - """A Boltzmann (Truncated Discrete Exponential) random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `boltzmann` is:: - - boltzmann.pmf(k) = (1-exp(-lambda_)*exp(-lambda_*k)/(1-exp(-lambda_*N)) - - for ``k = 0,...,N-1``. - - `boltzmann` takes ``lambda_`` and ``N`` as shape parameters. - - %(example)s - - """ - def _pmf(self, k, lambda_, N): - fact = (expm1(-lambda_))/(expm1(-lambda_*N)) - return fact*exp(-lambda_*k) - - def _cdf(self, x, lambda_, N): - k = floor(x) - return (expm1(-lambda_*(k+1)))/(expm1(-lambda_*N)) - - def _ppf(self, q, lambda_, N): - qnew = -q*(expm1(-lambda_*N)) - vals = ceil(-1.0/lambda_ * log1p(-qnew)-1) - vals1 = (vals-1).clip(0.0, np.inf) - temp = self._cdf(vals1, lambda_, N) - return where(temp >= q, vals1, vals) - - def _stats(self, lambda_, N): - z = exp(-lambda_) - zN = exp(-lambda_*N) - mu = z/(1.0-z)-N*zN/(1-zN) - var = z/(1.0-z)**2 - N*N*zN/(1-zN)**2 - trm = (1-zN)/(1-z) - trm2 = (z*trm**2 - N*N*zN) - g1 = z*(1+z)*trm**3 - N**3*zN*(1+zN) - g1 = g1 / trm2**(1.5) - g2 = z*(1+4*z+z*z)*trm**4 - N**4 * zN*(1+4*zN+zN*zN) - g2 = g2 / trm2 / trm2 - return mu, var, g1, g2 -boltzmann = boltzmann_gen(name='boltzmann', - longname='A truncated discrete exponential ') - - -class randint_gen(rv_discrete): - """A uniform discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `randint` is:: - - randint.pmf(k) = 1./(max- min) - - for ``k = min,...,max``. - - `randint` takes ``min`` and ``max`` as shape parameters. 
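# --- Illustrative sketch (editorial, not part of the patch) ------------------
# The Planck pmf quoted above, (1 - exp(-lambda_)) * exp(-lambda_*k) on
# k = 0, 1, 2, ..., is a discrete exponential: it equals a geometric
# distribution with p = 1 - exp(-lambda_), shifted to start at 0.
import numpy as np
from scipy import stats

lam = 0.7
p = 1.0 - np.exp(-lam)
k = np.arange(0, 15)
assert np.allclose(stats.planck.pmf(k, lam), stats.geom.pmf(k + 1, p))
assert np.allclose(stats.planck.cdf(k, lam), stats.geom.cdf(k + 1, p))
# ------------------------------------------------------------------------------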
- - %(example)s - - """ - def _argcheck(self, min, max): - self.a = min - self.b = max-1 - return (max > min) - - def _pmf(self, k, min, max): - fact = 1.0 / (max - min) - return fact - - def _cdf(self, x, min, max): - k = floor(x) - return (k-min+1)*1.0/(max-min) - - def _ppf(self, q, min, max): - vals = ceil(q*(max-min)+min)-1 - vals1 = (vals-1).clip(min, max) - temp = self._cdf(vals1, min, max) - return where(temp >= q, vals1, vals) - - def _stats(self, min, max): - m2, m1 = asarray(max), asarray(min) - mu = (m2 + m1 - 1.0) / 2 - d = m2 - m1 - var = (d-1)*(d+1.0)/12.0 - g1 = 0.0 - g2 = -6.0/5.0*(d*d+1.0)/(d-1.0)*(d+1.0) - return mu, var, g1, g2 - - def _rvs(self, min, max=None): - """An array of *size* random integers >= min and < max. - - If max is None, then range is >=0 and < min - """ - return mtrand.randint(min, max, self._size) - - def _entropy(self, min, max): - return log(max-min) -randint = randint_gen(name='randint',longname='A discrete uniform ' - '(random integer)') - - -# FIXME: problems sampling. -class zipf_gen(rv_discrete): - """A Zipf discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `zipf` is:: - - zipf.pmf(k) = 1/(zeta(a)*k**a) - - for ``k >= 1``. - - `zipf` takes ``a`` as shape parameter. - - %(example)s - - """ - def _rvs(self, a): - return mtrand.zipf(a, size=self._size) - - def _argcheck(self, a): - return a > 1 - - def _pmf(self, k, a): - Pk = 1.0 / asarray(special.zeta(a,1) * k**a) - return Pk - - def _munp(self, n, a): - return special.zeta(a-n,1) / special.zeta(a,1) - - def _stats(self, a): - sv = special.errprint(0) - fac = asarray(special.zeta(a,1)) - mu = special.zeta(a-1.0,1)/fac - mu2p = special.zeta(a-2.0,1)/fac - var = mu2p - mu*mu - mu3p = special.zeta(a-3.0,1)/fac - mu3 = mu3p - 3*mu*mu2p + 2*mu**3 - g1 = mu3 / asarray(var**1.5) - - mu4p = special.zeta(a-4.0,1)/fac - sv = special.errprint(sv) - mu4 = mu4p - 4*mu3p*mu + 6*mu2p*mu*mu - 3*mu**4 - g2 = mu4 / asarray(var**2) - 3.0 - return mu, var, g1, g2 -zipf = zipf_gen(a=1,name='zipf', longname='A Zipf') - - -class dlaplace_gen(rv_discrete): - """A Laplacian discrete random variable. - - %(before_notes)s - - Notes - ----- - The probability mass function for `dlaplace` is:: - - dlaplace.pmf(k) = tanh(a/2) * exp(-a*abs(k)) - - for ``a >0``. - - `dlaplace` takes ``a`` as shape parameter. - - %(example)s - - """ - def _pmf(self, k, a): - return tanh(a/2.0)*exp(-a*abs(k)) - - def _cdf(self, x, a): - k = floor(x) - ind = (k >= 0) - const = exp(a)+1 - return where(ind, 1.0-exp(-a*k)/const, exp(a*(k+1))/const) - - def _ppf(self, q, a): - const = 1.0/(1+exp(-a)) - cons2 = 1+exp(a) - ind = q < const - vals = ceil(where(ind, log(q*cons2)/a-1, -log((1-q)*cons2)/a)) - vals1 = (vals-1) - temp = self._cdf(vals1, a) - return where(temp >= q, vals1, vals) - - def _stats(self, a): - ea = exp(a) - mu2 = 2.*ea/(ea-1.)**2 - mu4 = 2.*ea*(ea**2+10.*ea+1.) / (ea-1.)**4 - return 0., mu2, 0., mu4/mu2**2 - 3. - - def _entropy(self, a): - return a / sinh(a) - log(tanh(a/2.0)) -dlaplace = dlaplace_gen(a=-inf, - name='dlaplace', longname='A discrete Laplacian') - - -class skellam_gen(rv_discrete): - """A Skellam discrete random variable. - - %(before_notes)s - - Notes - ----- - Probability distribution of the difference of two correlated or - uncorrelated Poisson random variables. - - Let k1 and k2 be two Poisson-distributed r.v. with expected values - lam1 and lam2. 
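# --- Illustrative sketch (editorial, not part of the patch) ------------------
# zipf_gen above normalizes k**(-a) with the Riemann zeta function, and its raw
# moments are ratios of zeta values, E[X**n] = zeta(a - n) / zeta(a) (finite
# only for n < a - 1). A quick numerical confirmation:
import numpy as np
from scipy import stats
from scipy.special import zeta

a = 4.0
k = np.arange(1, 2000)
pmf = 1.0 / (zeta(a, 1) * k**a)
assert np.allclose(stats.zipf.pmf(k, a), pmf)
assert abs(pmf.sum() - 1.0) < 1e-9                  # tail beyond k=2000 is tiny
assert np.isclose(stats.zipf.mean(a), zeta(a - 1, 1) / zeta(a, 1))
# ------------------------------------------------------------------------------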
Then, ``k1 - k2`` follows a Skellam distribution with - parameters ``mu1 = lam1 - rho*sqrt(lam1*lam2)`` and - ``mu2 = lam2 - rho*sqrt(lam1*lam2)``, where rho is the correlation - coefficient between k1 and k2. If the two Poisson-distributed r.v. - are independent then ``rho = 0``. - - Parameters mu1 and mu2 must be strictly positive. - - For details see: http://en.wikipedia.org/wiki/Skellam_distribution - - `skellam` takes ``mu1`` and ``mu2`` as shape parameters. - - %(example)s - - """ - def _rvs(self, mu1, mu2): - n = self._size - return np.random.poisson(mu1, n)-np.random.poisson(mu2, n) - - def _pmf(self, x, mu1, mu2): - px = np.where(x < 0, ncx2.pdf(2*mu2, 2*(1-x), 2*mu1)*2, - ncx2.pdf(2*mu1, 2*(x+1), 2*mu2)*2) - # ncx2.pdf() returns nan's for extremely low probabilities - return px - - def _cdf(self, x, mu1, mu2): - x = np.floor(x) - px = np.where(x < 0, ncx2.cdf(2*mu2, -2*x, 2*mu1), - 1-ncx2.cdf(2*mu1, 2*(x+1), 2*mu2)) - return px - - def _stats(self, mu1, mu2): - mean = mu1 - mu2 - var = mu1 + mu2 - g1 = mean / np.sqrt((var)**3) - g2 = 1 / var - return mean, var, g1, g2 -skellam = skellam_gen(a=-np.inf, name="skellam", longname='A Skellam') - - - - -def test_lognorm(): - lognorm.cdf(np.nan, 0.5, loc=3, scale=1) - lognorm.ppf(np.nan, 0.5, loc=3, scale=1) - lognorm.cdf(lognorm.ppf([-0.5,0,1e-4,0.5,1-1e-4,1,2],0.5,loc=3, - scale=1),0.5,loc=3,scale=1) - -def test_truncrayleigh(): - import matplotlib.pyplot as plt - from wafo.stats import truncrayleigh - numargs = truncrayleigh.numargs - [ c ] = [0.9,] * numargs - rv = truncrayleigh(c) - -#Display frozen pdf - - x = np.linspace(0, np.minimum(rv.dist.b, 3)) - h = plt.plot(x, rv.pdf(x)) - -#Check accuracy of cdf and ppf - - prb = truncrayleigh.cdf(x, c) - h = plt.semilogy(np.abs(x - truncrayleigh.ppf(prb, c)) + 1e-20) - -#Random number generation - - R = truncrayleigh.rvs(c, size=100) - -#Compare ML and MPS method - phat = truncrayleigh.fit2(R, method='ml'); - -def test_docstrings(): - import doctest - doctest.testmod() - -def test_script(): - import matplotlib - matplotlib.interactive(True) - R = norm.rvs(size=100) - phat = norm.fit(R) - - phat = genpareto.fit(R[R > 0.7], f0=0.1, floc=0.7) - - #nbinom(10, 0.75).rvs(3) - t = bernoulli(0.75).rvs(3) - x = np.r_[5, 10] - npr = np.r_[9, 9] - t2 = bd0(x, npr) - #Examples MLE and better CI for phat.par[0] - R = weibull_min.rvs(1, size=100); - phat = weibull_min.fit(R, 1, 1, par_fix=[nan, 0, nan]) - Lp = phat.profile(i=0) - Lp.plot() - Lp.get_bounds(alpha=0.1) - R = 1. 
/ 990 - x = phat.isf(R) - - # CI for x - Lx = phat.profile(i=0, x=x) - Lx.plot() - Lx.get_bounds(alpha=0.2) - - # CI for logSF=log(SF) - Lpr = phat.profile(i=1, logSF=log(R), link=phat.dist.link) - Lpr.plot() - Lpr.get_bounds(alpha=0.075) - - dlaplace.stats(0.8, loc=0) -# pass - t = planck(0.51000000000000001) - t.ppf(0.5) - t = zipf(2) - t.ppf(0.5) - import pylab as plb - rice.rvs(1) - x = plb.linspace(-5, 5) - y = genpareto.cdf(x, 0) - #plb.plot(x,y) - #plb.show() - - - on = ones((2, 3)) - r = genpareto.rvs(0, size=100) - pht = genpareto.fit(r, 1, par_fix=[0, 0, nan]) - lp = pht.profile() - -def test_binom(): - val = binom(100,1) - print(val.pmf(100)) - print(binom.pmf(100,100,1)) - a = poisson(0) - print(a.pmf(0)) - pass -def test_genpareto(): - - numargs = genpareto.numargs - [ c ] = [0.9,] * numargs - rv = genpareto(c) - R = genpareto.rvs(c, size=100) - - phat = genpareto.fit2(R, floc=0, fscale=1, method='mps') - print(phat.par) - -if __name__ == '__main__': - rv = reciprocal(0.2836212578535795, 1.2836212578535795) - vals = rv.ppf([0,1]) - rv = genextreme(1.9556140678015677) - vals = rv.ppf([0,1]) - - import matplotlib - #matplotlib.interactive=False - import matplotlib.pyplot as plt - plt.ioff() - aa = rice.pdf(1.0, 0.0) - a = beta.logpdf(0,1,0.5) - b = beta.logpdf(0,0.5,1) - c = gamma._logpdf(0,1) - x = np.linspace(0,5,40) - plt.plot(x, gamma.logpdf(x,1)) - plt.show() - pass - - prb = np.linspace(0,1, 10) - q = truncnorm.isf(prb,-1., 1., loc=[3],scale=2) - plt.plot(q, prb) - plt.show() - p = truncnorm.sf(q,-1,1, loc=[3],scale=2) - pass - #bernoulli.logcdf(np.nan) - #test_binom() - #test_doctstrings() - #test_genpareto() - #test_truncrayleigh() - #test_lognorm() - +# +# Author: Travis Oliphant 2002-2011 with contributions from +# SciPy Developers 2004-2011 +# +# NOTE: To look at history using `git blame`, use `git blame -M -C -C` +# instead of `git blame -Lxxx,+x`. +# +from __future__ import division, print_function, absolute_import + +from ._distn_infrastructure import entropy, rv_discrete, rv_continuous + +from ._continuous_distns import * +from ._discrete_distns import * diff --git a/pywafo/src/wafo/stats/estimation.py b/pywafo/src/wafo/stats/estimation.py index 61e0438..8a29501 100644 --- a/pywafo/src/wafo/stats/estimation.py +++ b/pywafo/src/wafo/stats/estimation.py @@ -1,7 +1,8 @@ -''' +''' Contains FitDistribution and Profile class, which are -important classes for fitting to various Continous and Discrete Probability Distributions +important classes for fitting to various Continous and Discrete Probability +Distributions Author: Per A. 
Brodtkorb 2008 ''' @@ -12,7 +13,7 @@ from wafo.plotbackend import plotbackend from wafo.misc import ecross, findcross -import numdifftools #@UnresolvedImport +import numdifftools # @UnresolvedImport from scipy import special from scipy.linalg import pinv2 from scipy import optimize @@ -20,34 +21,39 @@ from scipy import optimize import numpy import numpy as np from numpy import alltrue, arange, ravel, sum, zeros, log, sqrt, exp -from numpy import (atleast_1d, any, asarray, nan, pi, #reshape, #repeat, product, ndarray, - isfinite) +from numpy import ( + atleast_1d, any, asarray, nan, pi, # reshape, #repeat, product, ndarray, + isfinite) from numpy import flatnonzero as nonzero __all__ = [ - 'Profile', 'FitDistribution' - ] + 'Profile', 'FitDistribution' +] floatinfo = np.finfo(float) -#arr = atleast_1d +# arr = atleast_1d arr = asarray -all = alltrue #@ReservedAssignment +all = alltrue # @ReservedAssignment + def chi2isf(p, df): return special.chdtri(df, p) + def chi2sf(x, df): return special.chdtrc(df, x) + def norm_ppf(q): return special.ndtri(q) # Frozen RV class class rv_frozen(object): + ''' Frozen continous or discrete 1D Random Variable object (RV) - + Methods ------- rvs(size=1) @@ -67,91 +73,106 @@ class rv_frozen(object): entropy() (Differential) entropy of the RV. ''' + def __init__(self, dist, *args, **kwds): self.dist = dist - loc0, scale0 = map(kwds.get, ['loc', 'scale']) - if hasattr(dist, 'fix_loc_scale'): #isinstance(dist, rv_continuous): - args, loc0, scale0 = dist.fix_loc_scale(args, loc0, scale0) - self.par = args + (loc0, scale0) - else: # rv_discrete - args, loc0 = dist.fix_loc(args, loc0) - self.par = args + (loc0,) + args, loc, scale = dist._parse_args(*args, **kwds) + if len(args) == dist.numargs - 2: # isinstance(dist, rv_continuous): + self.par = args + (loc, scale) + else: # rv_discrete + self.par = args + (loc,) def pdf(self, x): ''' Probability density function at x of the given RV.''' return self.dist.pdf(x, *self.par) + def cdf(self, x): '''Cumulative distribution function at x of the given RV.''' return self.dist.cdf(x, *self.par) + def ppf(self, q): '''Percent point function (inverse of cdf) at q of the given RV.''' return self.dist.ppf(q, *self.par) + def isf(self, q): '''Inverse survival function at q of the given RV.''' return self.dist.isf(q, *self.par) + def rvs(self, size=None): '''Random variates of given type.''' kwds = dict(size=size) return self.dist.rvs(*self.par, **kwds) + def sf(self, x): '''Survival function (1-cdf) at x of the given RV.''' return self.dist.sf(x, *self.par) + def stats(self, moments='mv'): ''' Some statistics of the given RV''' kwds = dict(moments=moments) return self.dist.stats(*self.par, **kwds) + def median(self): return self.dist.median(*self.par) + def mean(self): return self.dist.mean(*self.par) + def var(self): return self.dist.var(*self.par) + def std(self): return self.dist.std(*self.par) + def moment(self, n): par1 = self.par[:self.dist.numargs] return self.dist.moment(n, *par1) + def entropy(self): return self.dist.entropy(*self.par) + def pmf(self, k): '''Probability mass function at k of the given RV''' return self.dist.pmf(k, *self.par) - def interval(self,alpha): + + def interval(self, alpha): return self.dist.interval(alpha, *self.par) # internal class to profile parameters of a given distribution class Profile(object): + ''' Profile Log- likelihood or Product Spacing-function. which can be used for constructing confidence interval for either phat[i], probability or quantile. 
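# --- Illustrative sketch (editorial, not part of the patch) ------------------
# rv_frozen above stores the distribution together with its fitted parameters
# (shape(s), loc, scale) so that later calls need no arguments. The same idea,
# shown with a scipy.stats distribution for illustration:
import numpy as np
from scipy import stats

frozen = stats.weibull_min(1.5, loc=0.0, scale=2.0)   # parameters fixed once
x = 1.0
assert np.isclose(frozen.pdf(x), stats.weibull_min.pdf(x, 1.5, loc=0.0, scale=2.0))
assert np.isclose(frozen.sf(x), stats.weibull_min.sf(x, 1.5, loc=0.0, scale=2.0))
# ------------------------------------------------------------------------------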
- + Parameters ---------- - fit_dist : FitDistribution object + fit_dist : FitDistribution object with ML or MPS estimated distribution parameters. **kwds : named arguments with keys - i : scalar integer - defining which distribution parameter to profile, i.e. which - parameter to keep fixed (default index to first non-fixed parameter) + i : scalar integer + defining which distribution parameter to profile, i.e. which + parameter to keep fixed (default first non-fixed parameter) pmin, pmax : real scalars - Interval for either the parameter, phat(i), prb, or x, used in the - optimization of the profile function (default is based on the - 100*(1-alpha)% confidence interval computed using the delta method.) + Interval for either the parameter, phat(i), prb, or x, used in the + optimization of the profile function (default is based on the + 100*(1-alpha)% confidence interval computed with the delta method.) N : scalar integer Max number of points used in Lp (default 100) x : real scalar Quantile (return value) (default None) logSF : real scalar log survival probability,i.e., SF = Prob(X>x;phat) (default None) - link : function connecting the quantile (x) and the survival probability - (SF) with the fixed distribution parameter, i.e.: - self.par[i] = link(x,logSF,self.par,i), where logSF = log(Prob(X>x;phat)). + link : function connecting the x-quantile and the survival probability + (SF) with the fixed distribution parameter, i.e.: + self.par[i] = link(x,logSF,self.par,i), where + logSF = log(Prob(X>x;phat)). This means that if: 1) x is not None then x is profiled 2) logSF is not None then logSF is profiled - 3) x and logSF both are None then self.par[i] is profiled (default) - alpha : real scalar + 3) x and logSF are None then self.par[i] is profiled (default) + alpha : real scalar confidence coefficent (default 0.05) Returns ------- @@ -160,7 +181,7 @@ class Profile(object): Lp = max(log(f(phat|data,phat(i)))), or Lp = max(log(f(phat|data,phat(i),x,prb))) - + Member methods ------------- plot() : Plot profile function with 100(1-alpha)% confidence interval @@ -182,16 +203,16 @@ class Profile(object): Examples -------- - # MLE + # MLE >>> import wafo.stats as ws >>> R = ws.weibull_min.rvs(1,size=100); >>> phat = FitDistribution(ws.weibull_min, R, 1, scale=1, floc=0.0) - + # Better CI for phat.par[i=0] >>> Lp = Profile(phat, i=0) >>> Lp.plot() >>> phat_ci = Lp.get_bounds(alpha=0.1) - + >>> SF = 1./990 >>> x = phat.isf(SF) @@ -207,7 +228,7 @@ class Profile(object): ''' def __init__(self, fit_dist, **kwds): - + try: i0 = (1 - numpy.isfinite(fit_dist.par_fix)).argmax() except: @@ -218,9 +239,12 @@ class Profile(object): self.title = 'Profile log' self.xlabel = '' self.ylabel = '' - self.i_fixed, self.N, self.alpha, self.pmin, self.pmax, self.x, self.logSF, self.link = map(kwds.get, - ['i', 'N', 'alpha', 'pmin', 'pmax', 'x', 'logSF', 'link'], - [i0, 100, 0.05, None, None, None, None, None]) + (self.i_fixed, self.N, self.alpha, self.pmin, self.pmax, self.x, + self.logSF, self.link) = map( + kwds.get, + ['i', 'N', 'alpha', 'pmin', + 'pmax', 'x', 'logSF', 'link'], + [i0, 100, 0.05, None, None, None, None, None]) self.ylabel = '%g%s CI' % (100 * (1.0 - self.alpha), '%') if fit_dist.method.startswith('ml'): @@ -230,8 +254,9 @@ class Profile(object): self.title = self.title + ' product spacing' Lmax = fit_dist.LPSmax else: - raise ValueError("PROFILE is only valid for ML- or MPS- estimators") - + raise ValueError( + "PROFILE is only valid for ML- or MPS- estimators") + if fit_dist.par_fix is None: 
isnotfixed = np.ones(fit_dist.par.shape, dtype=bool) else: @@ -242,21 +267,23 @@ class Profile(object): self.i_fixed = atleast_1d(self.i_fixed) if 1 - isnotfixed[self.i_fixed]: - raise ValueError("Index i must be equal to an index to one of the free parameters.") + raise ValueError( + "Index i must be equal to an index to one of the free " + + "parameters.") isfree = isnotfixed isfree[self.i_fixed] = False self.i_free = nonzero(isfree) self.Lmax = Lmax - self.alpha_Lrange = 0.5 * chi2isf(self.alpha, 1) + self.alpha_Lrange = 0.5 * chi2isf(self.alpha, 1) self.alpha_cross_level = Lmax - self.alpha_Lrange - #lowLevel = self.alpha_cross_level - self.alpha_Lrange / 7.0 + # lowLevel = self.alpha_cross_level - self.alpha_Lrange / 7.0 phatv = fit_dist.par.copy() self._par = phatv.copy() - - ## Set up variable to profile and _local_link function + + # Set up variable to profile and _local_link function self.profile_x = not self.x == None self.profile_logSF = not (self.logSF == None or self.profile_x) self.profile_par = not (self.profile_x or self.profile_logSF) @@ -264,59 +291,65 @@ class Profile(object): if self.link == None: self.link = self.fit_dist.dist.link if self.profile_par: - self._local_link = lambda fix_par, par : fix_par + self._local_link = lambda fix_par, par: fix_par self.xlabel = 'phat(%d)' % self.i_fixed p_opt = self._par[self.i_fixed] elif self.profile_x: self.logSF = log(fit_dist.sf(self.x)) - self._local_link = lambda fix_par, par : self.link(fix_par, self.logSF, par, self.i_fixed) + self._local_link = lambda fix_par, par: self.link( + fix_par, self.logSF, par, self.i_fixed) self.xlabel = 'x' p_opt = self.x elif self.profile_logSF: p_opt = self.logSF self.x = fit_dist.isf(exp(p_opt)) - self._local_link = lambda fix_par, par : self.link(self.x, fix_par, par, self.i_fixed) + self._local_link = lambda fix_par, par: self.link( + self.x, fix_par, par, self.i_fixed) self.xlabel = 'log(SF)' else: - raise ValueError("You must supply a non-empty quantile (x) or probability (logSF) in order to profile it!") + raise ValueError( + "You must supply a non-empty quantile (x) or probability " + + "(logSF) in order to profile it!") self.xlabel = self.xlabel + ' (' + fit_dist.dist.name + ')' - - phatfree = phatv[self.i_free].copy() - self._set_profile(phatfree, p_opt) + phatfree = phatv[self.i_free].copy() + self._set_profile(phatfree, p_opt) def _correct_Lmax(self, Lmax): - if Lmax > self.Lmax: #foundNewphat = True - warnings.warn('The fitted parameters does not provide the optimum fit. Something wrong with fit') + if Lmax > self.Lmax: # foundNewphat = True + warnings.warn( + 'The fitted parameters does not provide the optimum fit. 
' + + 'Something wrong with fit') dL = self.Lmax - Lmax self.alpha_cross_level -= dL self.Lmax = Lmax def _profile_optimum(self, phatfree0, p_opt): - phatfree = optimize.fmin(self._profile_fun, phatfree0, args=(p_opt, ), disp=0) + phatfree = optimize.fmin( + self._profile_fun, phatfree0, args=(p_opt,), disp=0) Lmax = -self._profile_fun(phatfree, p_opt) self._correct_Lmax(Lmax) return Lmax, phatfree def _set_profile(self, phatfree0, p_opt): pvec = self._get_pvec(phatfree0, p_opt) - + self.data = numpy.ones_like(pvec) * nan k1 = (pvec >= p_opt).argmax() - - for size, step in ((-1,-1), (pvec.size, 1)): + + for size, step in ((-1, -1), (pvec.size, 1)): phatfree = phatfree0.copy() for ix in xrange(k1, size, step): Lmax, phatfree = self._profile_optimum(phatfree, pvec[ix]) self.data[ix] = Lmax - if self.data[ix] < self.alpha_cross_level: + if self.data[ix] < self.alpha_cross_level: break np.putmask(pvec, np.isnan(self.data), nan) self.args = pvec - + self._prettify_profile() - + def _prettify_profile(self): pvec = self.args ix = nonzero(numpy.isfinite(pvec)) @@ -331,7 +364,7 @@ class Profile(object): t0 = ecross(self.args, self.data, ind1, cl) self.data.put(ind, cl) self.args.put(ind, t0) - + def _get_variance(self): if self.profile_par: pvar = self.fit_dist.par_cov[self.i_fixed, :][:, self.i_fixed] @@ -348,7 +381,7 @@ class Profile(object): pcov = self.fit_dist.par_cov[i_notfixed, :][:, i_notfixed] pvar = sum(numpy.dot(drl, pcov) * drl) return pvar - + def _get_pvec(self, phatfree0, p_opt): ''' return proper interval for the variable to profile ''' @@ -357,35 +390,39 @@ class Profile(object): if self.pmin == None or self.pmax == None: pvar = self._get_variance() - - if pvar<=1e-5 or numpy.isnan(pvar): - pvar = max(abs(p_opt)*0.5, 0.5) - - p_crit = -norm_ppf(self.alpha / 2.0) * sqrt(numpy.ravel(pvar)) * 1.5 + + if pvar <= 1e-5 or numpy.isnan(pvar): + pvar = max(abs(p_opt) * 0.5, 0.5) + + p_crit = (-norm_ppf(self.alpha / 2.0) * + sqrt(numpy.ravel(pvar)) * 1.5) if self.pmin == None: - self.pmin = self._search_pmin(phatfree0, p_opt - 5.0 * p_crit, p_opt) - p_crit_low = (p_opt-self.pmin)/5 - + self.pmin = self._search_pmin(phatfree0, + p_opt - 5.0 * p_crit, p_opt) + p_crit_low = (p_opt - self.pmin) / 5 + if self.pmax == None: - self.pmax = self._search_pmax(phatfree0,p_opt + 5.0 * p_crit, p_opt) - p_crit_up = (self.pmax-p_opt)/5 - + self.pmax = self._search_pmax(phatfree0, + p_opt + 5.0 * p_crit, p_opt) + p_crit_up = (self.pmax - p_opt) / 5 + N4 = numpy.floor(self.N / 4.0) pvec1 = linspace(self.pmin, p_opt - p_crit_low, N4 + 1) - pvec2 = linspace(p_opt - p_crit_low, p_opt + p_crit_up, self.N - 2 * N4) + pvec2 = linspace( + p_opt - p_crit_low, p_opt + p_crit_up, self.N - 2 * N4) pvec3 = linspace(p_opt + p_crit_up, self.pmax, N4 + 1) pvec = numpy.unique(numpy.hstack((pvec1, p_opt, pvec2, pvec3))) else: pvec = linspace(self.pmin, self.pmax, self.N) return pvec - + def _search_pmin(self, phatfree0, p_min0, p_opt): phatfree = phatfree0.copy() - - dp = p_opt-p_min0 - if dp<1e-2: + + dp = p_opt - p_min0 + if dp < 1e-2: dp = 0.1 p_min_opt = p_min0 Lmax, phatfree = self._profile_optimum(phatfree, p_opt) @@ -394,21 +431,21 @@ class Profile(object): Lmax, phatfree = self._profile_optimum(phatfree, p_min) if np.isnan(Lmax): dp *= 0.33 - elif Lmax < self.alpha_cross_level-self.alpha_Lrange*2: + elif Lmax < self.alpha_cross_level - self.alpha_Lrange * 2: p_min_opt = p_min dp *= 0.33 - elif Lmax self.data[ind + 1] @@ -486,68 +525,74 @@ class Profile(object): def plot(self): ''' Plot profile function with 
100(1-alpha)% CI ''' - plotbackend.plot(self.args, self.data, - self.args[[0, -1]], [self.Lmax, ]*2, 'r', - self.args[[0, -1]], [self.alpha_cross_level, ]*2, 'r') + plotbackend.plot( + self.args, self.data, + self.args[[0, -1]], [self.Lmax, ] * 2, 'r', + self.args[[0, -1]], [self.alpha_cross_level, ] * 2, 'r') plotbackend.title(self.title) plotbackend.ylabel(self.ylabel) plotbackend.xlabel(self.xlabel) - + + def _discretize_adaptive(fun, a, b, tol=0.005, n=5): ''' Automatic discretization of function, adaptive gridding. ''' tiny = floatinfo.tiny - n += (np.mod(n, 2) == 0) # make sure n is odd + n += (np.mod(n, 2) == 0) # make sure n is odd x = np.linspace(a, b, n) fx = fun(x) - + n2 = (n - 1) / 2 erri = np.hstack((np.zeros((n2, 1)), np.ones((n2, 1)))).ravel() err = erri.max() err0 = np.inf - #while (err != err0 and err > tol and n < nmax): + # while (err != err0 and err > tol and n < nmax): for j in range(50): if err != err0 and np.any(erri > tol): err0 = err # find top errors - + I, = np.where(erri > tol) - # double the sample rate in intervals with the most error - y = (np.vstack(((x[I] + x[I - 1]) / 2, (x[I + 1] + x[I]) / 2)).T).ravel() + # double the sample rate in intervals with the most error + y = (np.vstack(((x[I] + x[I - 1]) / 2, + (x[I + 1] + x[I]) / 2)).T).ravel() fy = fun(y) - + fy0 = np.interp(y, x, fx) erri = 0.5 * (abs((fy0 - fy) / (abs(fy0 + fy) + tiny))) - + err = erri.max() - + x = np.hstack((x, y)) - + I = x.argsort() x = x[I] erri = np.hstack((zeros(len(fx)), erri))[I] fx = np.hstack((fx, fy))[I] - + else: break else: warnings.warn('Recursion level limit reached j=%d' % j) - + return x, fx # class to fit given distribution to data + + class FitDistribution(rv_frozen): + ''' Return estimators to shape, location, and scale from data Starting points for the fit are given by input arguments. For any arguments not given starting points, dist._fitstart(data) is called to get the starting estimates. - + You can hold some parameters fixed to specific values by passing in keyword arguments f0..fn for shape paramters and floc, fscale for location and scale parameters. - + Parameters ---------- dist : scipy distribution object @@ -579,7 +624,7 @@ class FitDistribution(rv_frozen): plus args (for extra arguments to pass to the function to be optimized) and disp=0 to suppress output as keyword arguments. - + Return ------ phat : FitDistribution object @@ -592,32 +637,31 @@ class FitDistribution(rv_frozen): par_fix : fixed distribution parameters par_lower : lower (1-alpha)% confidence bound for the parameters par_upper : upper (1-alpha)% confidence bound for the parameters - - + Note ---- `data` is sorted using this function, so if `copydata`==False the data in your namespace will be sorted as well. - + Examples -------- Estimate distribution parameters for weibull_min distribution. >>> import wafo.stats as ws >>> R = ws.weibull_min.rvs(1,size=100); >>> phat = FitDistribution(ws.weibull_min, R, 1, scale=1, floc=0.0) - + #Plot various diagnostic plots to asses quality of fit. 
- >>> phat.plotfitsummary() - + >>> phat.plotfitsummary() + #phat.par holds the estimated parameters #phat.par_upper upper CI for parameters #phat.par_lower lower CI for parameters - + #Better CI for phat.par[0] >>> Lp = phat.profile(i=0) >>> Lp.plot() >>> p_ci = Lp.get_bounds(alpha=0.1) - + >>> SF = 1./990 >>> x = phat.isf(SF) @@ -625,12 +669,13 @@ class FitDistribution(rv_frozen): >>> Lx = phat.profile(i=0,x=x,link=phat.dist.link) >>> Lx.plot() >>> x_ci = Lx.get_bounds(alpha=0.2) - + # CI for logSF=log(SF) >>> Lsf = phat.profile(i=0, logSF=log(SF), link=phat.dist.link) >>> Lsf.plot() >>> sf_ci = Lsf.get_bounds(alpha=0.2) ''' + def __init__(self, dist, data, *args, **kwds): extradoc = ''' plotfitsummary() @@ -645,10 +690,10 @@ class FitDistribution(rv_frozen): Displays a residual quantile plot. plotresprb() Displays a residual probability plot. - + profile() Return Profile Log- likelihood or Product Spacing-function. - + Parameters ---------- x : array-like @@ -683,70 +728,80 @@ class FitDistribution(rv_frozen): self.__doc__ = rv_frozen.__doc__ + extradoc self.dist = dist numargs = dist.numargs - - self.method=self.alpha=self.par_fix=self.search=self.copydata=None + + self.method = self.alpha = self.par_fix = self.search = None + self.copydata = None m_variables = ['method', 'alpha', 'par_fix', 'search', 'copydata'] m_defaults = ['ml', 0.05, None, True, True] - for (name, val) in zip(m_variables,m_defaults): - setattr(self, name, kwds.get(name,val)) - - #self.method, self.alpha, self.par_fix, self.search, self.copydata = map(kwds.get, m_variables, m_defaults) + for (name, val) in zip(m_variables, m_defaults): + setattr(self, name, kwds.get(name, val)) + if self.method.lower()[:].startswith('mps'): self._fitfun = dist.nlogps else: self._fitfun = dist.nnlf - + self.data = ravel(data) if self.copydata: self.data = self.data.copy() self.data.sort() - + par, fixedn = self._fit(*args, **kwds) self.par = arr(par) - somefixed = len(fixedn)>0 + somefixed = len(fixedn) > 0 if somefixed: - self.par_fix = [nan,]*len(self.par) + self.par_fix = [nan, ] * len(self.par) for i in fixedn: self.par_fix[i] = self.par[i] - + self.i_notfixed = nonzero(1 - isfinite(self.par_fix)) self.i_fixed = nonzero(isfinite(self.par_fix)) numpar = numargs + 2 self.par_cov = zeros((numpar, numpar)) self._compute_cov() - + # Set confidence interval for parameters pvar = numpy.diag(self.par_cov) zcrit = -norm_ppf(self.alpha / 2.0) self.par_lower = self.par - zcrit * sqrt(pvar) self.par_upper = self.par + zcrit * sqrt(pvar) - + self.LLmax = -dist.nnlf(self.par, self.data) self.LPSmax = -dist.nlogps(self.par, self.data) self.pvalue = self._pvalue(self.par, self.data, unknown_numpar=numpar) - + + def __repr__(self): + params = ['alpha', 'method', 'LLmax', 'LPSmax', 'pvalue', + 'par', 'par_lower', 'par_upper', 'par_fix', 'par_cov'] + t = ['%s:\n' % self.__class__.__name__] + for par in params: + t.append('%s = %s\n' % (par, str(getattr(self, par)))) + return ''.join(t) + def _reduce_func(self, args, kwds): args = list(args) Nargs = len(args) fixedn = [] - index = range(Nargs) - names = ['f%d' % n for n in range(Nargs-2)] + ['floc', 'fscale'] + index = range(Nargs) + names = ['f%d' % n for n in range(Nargs - 2)] + ['floc', 'fscale'] x0 = args[:] for n, key in zip(index[::-1], names[::-1]): - if kwds.has_key(key): + if key in kwds: fixedn.append(n) args[n] = kwds[key] del x0[n] - + fitfun = self._fitfun - + if len(fixedn) == 0: func = fitfun restore = None else: if len(fixedn) == len(index): - raise ValueError, "All parameters 
fixed. There is nothing to optimize." + raise ValueError("All parameters fixed. " + + "There is nothing to optimize.") + def restore(args, theta): # Replace with theta for all numbers not in fixedn # This allows the non-fixed values to vary, but @@ -763,19 +818,19 @@ class FitDistribution(rv_frozen): return fitfun(newtheta, x) return x0, func, restore, args, fixedn - + def _fit(self, *args, **kwds): - + dist = self.dist data = self.data - + Narg = len(args) if Narg > dist.numargs: - raise ValueError, "Too many input arguments." - start = [None]*2 - if (Narg < dist.numargs) or not (kwds.has_key('loc') and - kwds.has_key('scale')): - start = dist._fitstart(data) # get distribution specific starting locations + raise ValueError("Too many input arguments.") + start = [None] * 2 + if (Narg < dist.numargs) or not ('loc' in kwds and 'scale' in kwds): + # get distribution specific starting locations + start = dist._fitstart(data) args += start[Narg:-2] loc = kwds.get('loc', start[-2]) scale = kwds.get('scale', start[-1]) @@ -784,36 +839,37 @@ class FitDistribution(rv_frozen): if self.search: optimizer = kwds.get('optimizer', optimize.fmin) # convert string to function in scipy.optimize - if not callable(optimizer) and isinstance(optimizer, (str, unicode)): + if (not callable(optimizer) and + isinstance(optimizer, (str, unicode))): if not optimizer.startswith('fmin_'): - optimizer = "fmin_"+optimizer - if optimizer == 'fmin_': + optimizer = "fmin_" + optimizer + if optimizer == 'fmin_': optimizer = 'fmin' try: optimizer = getattr(optimize, optimizer) except AttributeError: - raise ValueError, "%s is not a valid optimizer" % optimizer - - vals = optimizer(func,x0,args=(ravel(data),),disp=0) + raise ValueError("%s is not a valid optimizer" % optimizer) + + vals = optimizer(func, x0, args=(ravel(data),), disp=0) vals = tuple(vals) else: vals = tuple(x0) if restore is not None: vals = restore(args, vals) return vals, fixedn - + def _compute_cov(self): '''Compute covariance ''' somefixed = (self.par_fix != None) and any(isfinite(self.par_fix)) - #H1 = numpy.asmatrix(self.dist.hessian_nnlf(self.par, self.data)) + # H1 = numpy.asmatrix(self.dist.hessian_nnlf(self.par, self.data)) H = numpy.asmatrix(self.dist.hessian_nlogps(self.par, self.data)) self.H = H try: if somefixed: allfixed = all(isfinite(self.par_fix)) if allfixed: - self.par_cov[:,:]=0 + self.par_cov[:, :] = 0 else: pcov = -pinv2(H[self.i_notfixed, :][..., self.i_notfixed]) for row, ix in enumerate(list(self.i_notfixed)): @@ -822,7 +878,7 @@ class FitDistribution(rv_frozen): self.par_cov = -pinv2(H) except: self.par_cov[:, :] = nan - + def fitfun(self, phat): return self._fitfun(phat, self.data) @@ -834,41 +890,42 @@ class FitDistribution(rv_frozen): Parameters ---------- **kwds : named arguments with keys - i : scalar integer - defining which distribution parameter to profile, i.e. which - parameter to keep fixed (default index to first non-fixed parameter) + i : scalar integer + defining which distribution parameter to profile, i.e. which + parameter to keep fixed (default first non-fixed parameter) pmin, pmax : real scalars - Interval for either the parameter, phat(i), prb, or x, used in the - optimization of the profile function (default is based on the - 100*(1-alpha)% confidence interval computed using the delta method.) + Interval for either the parameter, phat(i), prb, or x, used in the + optimization of the profile function (default is based on the + 100*(1-alpha)% confidence interval computed with the delta method.) 
N : scalar integer Max number of points used in Lp (default 100) x : real scalar Quantile (return value) (default None) logSF : real scalar log survival probability,i.e., SF = Prob(X>x;phat) (default None) - link : function connecting the quantile (x) and the survival probability - (SF) with the fixed distribution parameter, i.e.: - self.par[i] = link(x,logSF,self.par,i), where logSF = log(Prob(X>x;phat)). + link : function connecting the x-quantile and the survival probability + (SF) with the fixed distribution parameter, i.e.: + self.par[i] = link(x,logSF,self.par,i), where + logSF = log(Prob(X>x;phat)). This means that if: 1) x is not None then x is profiled 2) logSF is not None then logSF is profiled - 3) x and logSF both are None then self.par[i] is profiled (default) - alpha : real scalar + 3) x and logSF are None then self.par[i] is profiled (default) + alpha : real scalar confidence coefficent (default 0.05) Returns ------- Lp : Profile log-likelihood function with parameters phat given - the data, phat(i), probability (prb) and quantile (x) (if given), i.e., + the data, phat(i), probability (prb) and quantile (x), i.e., Lp = max(log(f(phat|data,phat(i)))), or Lp = max(log(f(phat|data,phat(i),x,prb))) - + Member methods ------------- plot() : Plot profile function with 100(1-alpha)% confidence interval get_bounds() : Return 100(1-alpha)% confidence interval - + Member variables ---------------- fit_dist : FitDistribution data object. @@ -877,32 +934,33 @@ class FitDistribution(rv_frozen): alpha : confidence coefficient Lmax : Maximum value of profile function alpha_cross_level : - - PROFILE is a utility function for making inferences either on a particular - component of the vector phat or the quantile, x, or the probability, SF. - This is usually more accurate than using the delta method assuming - asymptotic normality of the ML estimator or the MPS estimator. - + + PROFILE is a utility function for making inferences either on a + particular component of the vector phat or the quantile, x, or the + probability, SF. This is usually more accurate than using the delta + method assuming asymptotic normality of the ML estimator or the MPS + estimator. + Examples -------- - # MLE + # MLE >>> import wafo.stats as ws >>> R = ws.weibull_min.rvs(1,size=100); >>> phat = FitDistribution(ws.weibull_min, R, 1, scale=1, floc=0.0) - + # Better CI for phat.par[i=0] >>> Lp = Profile(phat, i=0) >>> Lp.plot() >>> phat_ci = Lp.get_bounds(alpha=0.1) - + >>> SF = 1./990 >>> x = phat.isf(SF) - + # CI for x >>> Lx = phat.profile(i=0, x=x, link=phat.dist.link) >>> Lx.plot() >>> x_ci = Lx.get_bounds(alpha=0.2) - + # CI for logSF=log(SF) >>> Lsf = phat.profile(i=0, logSF=log(SF), link=phat.dist.link) >>> Lsf.plot() @@ -917,15 +975,16 @@ class FitDistribution(rv_frozen): def plotfitsummary(self): ''' Plot various diagnostic plots to asses the quality of the fit. - PLOTFITSUMMARY displays probability plot, density plot, residual quantile - plot and residual probability plot. + PLOTFITSUMMARY displays probability plot, density plot, residual + quantile plot and residual probability plot. The purpose of these plots is to graphically assess whether the data - could come from the fitted distribution. If so the empirical- CDF and PDF - should follow the model and the residual plots will be linear. Other - distribution types will introduce curvature in the residual plots. + could come from the fitted distribution. If so the empirical- CDF and + PDF should follow the model and the residual plots will be linear. 
+ Other distribution types will introduce curvature in the residual + plots. ''' plotbackend.subplot(2, 2, 1) - #self.plotecdf() + # self.plotecdf() self.plotesf() plotbackend.subplot(2, 2, 2) self.plotepdf() @@ -933,19 +992,19 @@ class FitDistribution(rv_frozen): self.plotresq() plotbackend.subplot(2, 2, 4) self.plotresprb() - + fixstr = '' if not self.par_fix == None: numfix = len(self.i_fixed) if numfix > 0: - format0 = ', '.join(['%d']*numfix) - format1 = ', '.join(['%g']*numfix) + format0 = ', '.join(['%d'] * numfix) + format1 = ', '.join(['%g'] * numfix) phatistr = format0 % tuple(self.i_fixed) phatvstr = format1 % tuple(self.par[self.i_fixed]) fixstr = 'Fixed: phat[%s] = %s ' % (phatistr, phatvstr) - - infostr = 'Fit method: %s, Fit p-value: %2.2f %s' % (self.method, self.pvalue, fixstr) + infostr = 'Fit method: %s, Fit p-value: %2.2f %s' % ( + self.method, self.pvalue, fixstr) try: plotbackend.figtext(0.05, 0.01, infostr) except: @@ -961,9 +1020,10 @@ class FitDistribution(rv_frozen): ''' n = len(self.data) SF = (arange(n, 0, -1)) / n - plotbackend.semilogy(self.data, SF, symb2, self.data, self.sf(self.data), symb1) - #plotbackend.plot(self.data,SF,'b.',self.data,self.sf(self.data),'r-') - plotbackend.xlabel('x'); + plotbackend.semilogy( + self.data, SF, symb2, self.data, self.sf(self.data), symb1) + # plotbackend.plot(self.data,SF,'b.',self.data,self.sf(self.data),'r-') + plotbackend.xlabel('x') plotbackend.ylabel('F(x) (%s)' % self.dist.name) plotbackend.title('Empirical SF plot') @@ -977,59 +1037,64 @@ class FitDistribution(rv_frozen): ''' n = len(self.data) F = (arange(1, n + 1)) / n - plotbackend.plot(self.data, F, symb2, self.data, self.cdf(self.data), symb1) - plotbackend.xlabel('x'); + plotbackend.plot(self.data, F, symb2, + self.data, self.cdf(self.data), symb1) + plotbackend.xlabel('x') plotbackend.ylabel('F(x) (%s)' % self.dist.name) plotbackend.title('Empirical CDF plot') - def plotepdf(self, symb1='r-', symb2='b-'): - '''Plot Empirical and fitted Probability Density Function - - The purpose of the plot is to graphically assess whether - the data could come from the fitted distribution. - If so the histogram should resemble the model density. - Other distribution types will introduce deviations in the plot. 
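# Illustrative sketch of the diagnostic idea described above, written against
# plain scipy.stats and matplotlib rather than the wafo plotbackend used in
# this patch (an assumption, not part of the patch): overlay a normalised
# histogram of the data with the fitted density; the two should agree if the
# model is adequate.
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

data = stats.weibull_min.rvs(1.5, size=200, random_state=0)
c, loc, scale = stats.weibull_min.fit(data, floc=0)
x = np.linspace(data.min(), data.max(), 200)
plt.hist(data, bins=20, density=True, alpha=0.4, label='empirical')
plt.plot(x, stats.weibull_min.pdf(x, c, loc, scale), 'r-', label='fitted pdf')
plt.legend()
plt.show()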
- ''' - odd = False + def _get_grid(self, odd=False): x = np.atleast_1d(self.data) n = np.ceil(4 * np.sqrt(np.sqrt(len(x)))) - mn = x.min() mx = x.max() d = (mx - mn) / n * 2 - e = np.floor(np.log(d) / np.log(10)); + e = np.floor(np.log(d) / np.log(10)) m = np.floor(d / 10 ** e) if m > 5: m = 5 elif m > 2: m = 2 - d = m * 10 ** e mn = (np.floor(mn / d) - 1) * d - odd * d / 2 mx = (np.ceil(mx / d) + 1) * d + odd * d / 2 limits = np.arange(mn, mx, d) - bin, limits = np.histogram(self.data, bins=limits, normed=True) #, new=True) @ReservedAssignment - limits.shape = (-1, 1) - xx = limits.repeat(3, axis=1) - xx.shape = (-1,) - xx = xx[1:-1] - bin.shape = (-1, 1) - yy = bin.repeat(3, axis=1) - #yy[0,0] = 0.0 # pdf - yy[:, 0] = 0.0 # histogram + return limits + + def _staircase(self, x, y): + xx = x.reshape(-1, 1).repeat(3, axis=1).ravel()[1:-1] + yy = y.reshape(-1, 1).repeat(3, axis=1) + # yy[0,0] = 0.0 # pdf + yy[:, 0] = 0.0 # histogram yy.shape = (-1,) yy = numpy.hstack((yy, 0.0)) - ymax = yy.max() - #plotbackend.hist(self.data,normed=True,fill=False) - plotbackend.plot(self.data, self.pdf(self.data), symb1, xx, yy, symb2) + return xx, yy + + def _get_empirical_pdf(self): + limits = self._get_grid() + pdf, x = np.histogram(self.data, bins=limits, normed=True) + return self._staircase(x, pdf) + + def plotepdf(self, symb1='r-', symb2='b-'): + '''Plot Empirical and fitted Probability Density Function + + The purpose of the plot is to graphically assess whether + the data could come from the fitted distribution. + If so the histogram should resemble the model density. + Other distribution types will introduce deviations in the plot. + ''' + x, pdf = self._get_empirical_pdf() + ymax = pdf.max() + # plotbackend.hist(self.data,normed=True,fill=False) + plotbackend.plot(self.data, self.pdf(self.data), symb1, + x, pdf, symb2) ax = list(plotbackend.axis()) - ax[3] = min(ymax*1.3, ax[3]) + ax[3] = min(ymax * 1.3, ax[3]) plotbackend.axis(ax) - plotbackend.xlabel('x'); + plotbackend.xlabel('x') plotbackend.ylabel('f(x) (%s)' % self.dist.name) plotbackend.title('Density plot') - def plotresq(self, symb1='r-', symb2='b.'): '''PLOTRESQ displays a residual quantile plot. @@ -1045,33 +1110,34 @@ class FitDistribution(rv_frozen): plotbackend.plot(self.data, y, symb2, y1, y1, symb1) plotbackend.xlabel('Empirical') plotbackend.ylabel('Model (%s)' % self.dist.name) - plotbackend.title('Residual Quantile Plot'); + plotbackend.title('Residual Quantile Plot') plotbackend.axis('tight') plotbackend.axis('equal') - def plotresprb(self, symb1='r-', symb2='b.'): ''' PLOTRESPRB displays a residual probability plot. The purpose of the plot is to graphically assess whether the data could come from the fitted distribution. If so the - plot will be linear. Other distribution types will introduce curvature in the plot. + plot will be linear. Other distribution types will introduce curvature + in the plot. 
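# A minimal residual quantile plot in the spirit of plotresq above, sketched
# with plain scipy.stats and matplotlib (assumed, not the FitDistribution API):
# sorted data plotted against model quantiles should fall on a straight line
# when the fitted distribution is appropriate.
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

data = np.sort(stats.weibull_min.rvs(1.5, size=200, random_state=3))
c, loc, scale = stats.weibull_min.fit(data, floc=0)
probs = np.arange(1, len(data) + 1) / (len(data) + 1.0)
model_q = stats.weibull_min.ppf(probs, c, loc, scale)
plt.plot(data, model_q, 'b.', model_q, model_q, 'r-')
plt.xlabel('Empirical quantiles')
plt.ylabel('Model quantiles')
plt.show()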
''' - n = len(self.data); - #ecdf = (0.5:n-0.5)/n; + n = len(self.data) + # ecdf = (0.5:n-0.5)/n; ecdf = arange(1, n + 1) / (n + 1) mcdf = self.cdf(self.data) p1 = [0, 1] - plotbackend.plot(ecdf, mcdf, symb2, p1, p1, symb1) + plotbackend.plot(ecdf, mcdf, symb2, + p1, p1, symb1) plotbackend.xlabel('Empirical') plotbackend.ylabel('Model (%s)' % self.dist.name) - plotbackend.title('Residual Probability Plot'); + plotbackend.title('Residual Probability Plot') plotbackend.axis('equal') plotbackend.axis([0, 1, 0, 1]) - def _pvalue(self, theta, x, unknown_numpar=None): - ''' Return the P-value for the fit using Moran's negative log Product Spacings statistic + ''' Return P-value for the fit using Moran's negative log Product + Spacings statistic where theta are the parameters (including loc and scale) @@ -1080,7 +1146,9 @@ class FitDistribution(rv_frozen): dx = numpy.diff(x, axis=0) tie = (dx == 0) if any(tie): - warnings.warn('P-value is on the conservative side (i.e. too large) due to ties in the data!') + warnings.warn( + 'P-value is on the conservative side (i.e. too large) due to' + + ' ties in the data!') T = self.dist.nlogps(theta, x) @@ -1096,54 +1164,51 @@ class FitDistribution(rv_frozen): v = (np1) * (pi ** 2. / 6.0 - 1.0) - 0.5 - 1.0 / (6. * (np1)) C1 = m - sqrt(0.5 * n * v) C2 = sqrt(v / (2.0 * n)) - Tn = (T + 0.5 * k * isParUnKnown - C1) / C2 # chi2 with n degrees of freedom - pvalue = chi2sf(Tn, n) #_WAFODIST.chi2.sf(Tn, n) + # chi2 with n degrees of freedom + Tn = (T + 0.5 * k * isParUnKnown - C1) / C2 + pvalue = chi2sf(Tn, n) # _WAFODIST.chi2.sf(Tn, n) return pvalue - - def test_doctstrings(): import doctest doctest.testmod() - + + def test1(): import wafo.stats as ws dist = ws.weibull_min - dist = ws.bradford - R = dist.rvs(0.3,size=1000); + #dist = ws.bradford + R = dist.rvs(0.3, size=1000) phat = FitDistribution(dist, R, method='ml') - -# # Better CI for phat.par[i=0] - Lp1 = Profile(phat, i=0) #@UnusedVariable + +# Better CI for phat.par[i=0] + Lp1 = Profile(phat, i=0) # @UnusedVariable # Lp2 = Profile(phat, i=2) # SF = 1./990 # x = phat.isf(SF) # -# # CI for x +# CI for x # Lx = Profile(phat, i=0,x=x,link=phat.dist.link) # Lx.plot() # x_ci = Lx.get_bounds(alpha=0.2) -# -# # CI for logSF=log(SF) +# +# CI for logSF=log(SF) # Lsf = phat.profile(i=0, logSF=log(SF), link=phat.dist.link) # Lsf.plot() # sf_ci = Lsf.get_bounds(alpha=0.2) # pass - - - - - + + # _WAFODIST = ppimport('wafo.stats.distributions') -# #nbinom(10, 0.75).rvs(3) +# nbinom(10, 0.75).rvs(3) # import matplotlib # matplotlib.interactive(True) # t = _WAFODIST.bernoulli(0.75).rvs(3) # x = np.r_[5, 10] # npr = np.r_[9, 9] # t2 = _WAFODIST.bd0(x, npr) -# #Examples MLE and better CI for phat.par[0] +# Examples MLE and better CI for phat.par[0] # R = _WAFODIST.weibull_min.rvs(1, size=100); # phat = _WAFODIST.weibull_min.fit(R, 1, 1, par_fix=[nan, 0, nan]) # Lp = phat.profile(i=0) @@ -1152,18 +1217,18 @@ def test1(): # R = 1. 
/ 990 # x = phat.isf(R) # -# # CI for x +# CI for x # Lx = phat.profile(i=0, x=x) # Lx.plot() # Lx.get_bounds(alpha=0.2) # -# # CI for logSF=log(SF) +# CI for logSF=log(SF) # Lpr = phat.profile(i=0, logSF=log(R), link=phat.dist.link) # Lpr.plot() # Lpr.get_bounds(alpha=0.075) # # _WAFODIST.dlaplace.stats(0.8, loc=0) -## pass +# pass # t = _WAFODIST.planck(0.51000000000000001) # t.ppf(0.5) # t = _WAFODIST.zipf(2) @@ -1172,15 +1237,14 @@ def test1(): # _WAFODIST.rice.rvs(1) # x = plb.linspace(-5, 5) # y = _WAFODIST.genpareto.cdf(x, 0) -# #plb.plot(x,y) -# #plb.show() +# plb.plot(x,y) +# plb.show() # # # on = ones((2, 3)) # r = _WAFODIST.genpareto.rvs(0, size=100) # pht = _WAFODIST.genpareto.fit(r, 1, par_fix=[0, 0, nan]) # lp = pht.profile() - if __name__ == '__main__': test1() - #test_doctstrings() + # test_doctstrings() diff --git a/pywafo/src/wafo/stats/kde.py b/pywafo/src/wafo/stats/kde.py new file mode 100644 index 0000000..74a7b0d --- /dev/null +++ b/pywafo/src/wafo/stats/kde.py @@ -0,0 +1,513 @@ +#------------------------------------------------------------------------------- +# +# Define classes for (uni/multi)-variate kernel density estimation. +# +# Currently, only Gaussian kernels are implemented. +# +# Written by: Robert Kern +# +# Date: 2004-08-09 +# +# Modified: 2005-02-10 by Robert Kern. +# Contributed to Scipy +# 2005-10-07 by Robert Kern. +# Some fixes to match the new scipy_core +# +# Copyright 2004-2005 by Enthought, Inc. +# +#------------------------------------------------------------------------------- + +from __future__ import division, print_function, absolute_import + +# Standard library imports. +import warnings + +# Scipy imports. +from scipy.lib.six import callable, string_types +from scipy import linalg, special + +from numpy import atleast_2d, reshape, zeros, newaxis, dot, exp, pi, sqrt, \ + ravel, power, atleast_1d, squeeze, sum, transpose +import numpy as np +from numpy.random import randint, multivariate_normal + +# Local imports. +from . import mvn + + +__all__ = ['gaussian_kde'] + + +class gaussian_kde(object): + """Representation of a kernel-density estimate using Gaussian kernels. + + Kernel density estimation is a way to estimate the probability density + function (PDF) of a random variable in a non-parametric way. + `gaussian_kde` works for both uni-variate and multi-variate data. It + includes automatic bandwidth determination. The estimation works best for + a unimodal distribution; bimodal or multi-modal distributions tend to be + oversmoothed. + + Parameters + ---------- + dataset : array_like + Datapoints to estimate from. In case of univariate data this is a 1-D + array, otherwise a 2-D array with shape (# of dims, # of data). + bw_method : str, scalar or callable, optional + The method used to calculate the estimator bandwidth. This can be + 'scott', 'silverman', a scalar constant or a callable. If a scalar, + this will be used directly as `kde.factor`. If a callable, it should + take a `gaussian_kde` instance as only parameter and return a scalar. + If None (default), 'scott' is used. See Notes for more details. + + Attributes + ---------- + dataset : ndarray + The dataset with which `gaussian_kde` was initialized. + d : int + Number of dimensions. + n : int + Number of datapoints. + factor : float + The bandwidth factor, obtained from `kde.covariance_factor`, with which + the covariance matrix is multiplied. + covariance : ndarray + The covariance matrix of `dataset`, scaled by the calculated bandwidth + (`kde.factor`). 
+ inv_cov : ndarray + The inverse of `covariance`. + + Methods + ------- + kde.evaluate(points) : ndarray + Evaluate the estimated pdf on a provided set of points. + kde(points) : ndarray + Same as kde.evaluate(points) + kde.integrate_gaussian(mean, cov) : float + Multiply pdf with a specified Gaussian and integrate over the whole + domain. + kde.integrate_box_1d(low, high) : float + Integrate pdf (1D only) between two bounds. + kde.integrate_box(low_bounds, high_bounds) : float + Integrate pdf over a rectangular space between low_bounds and + high_bounds. + kde.integrate_kde(other_kde) : float + Integrate two kernel density estimates multiplied together. + kde.resample(size=None) : ndarray + Randomly sample a dataset from the estimated pdf. + kde.set_bandwidth(bw_method='scott') : None + Computes the bandwidth, i.e. the coefficient that multiplies the data + covariance matrix to obtain the kernel covariance matrix. + .. versionadded:: 0.11.0 + kde.covariance_factor : float + Computes the coefficient (`kde.factor`) that multiplies the data + covariance matrix to obtain the kernel covariance matrix. + The default is `scotts_factor`. A subclass can overwrite this method + to provide a different method, or set it through a call to + `kde.set_bandwidth`. + + + Notes + ----- + Bandwidth selection strongly influences the estimate obtained from the KDE + (much more so than the actual shape of the kernel). Bandwidth selection + can be done by a "rule of thumb", by cross-validation, by "plug-in + methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde` + uses a rule of thumb, the default is Scott's Rule. + + Scott's Rule [1]_, implemented as `scotts_factor`, is:: + + n**(-1./(d+4)), + + with ``n`` the number of data points and ``d`` the number of dimensions. + Silverman's Rule [2]_, implemented as `silverman_factor`, is:: + + n * (d + 2) / 4.)**(-1. / (d + 4)). + + Good general descriptions of kernel density estimation can be found in [1]_ + and [2]_, the mathematics for this multi-dimensional implementation can be + found in [1]_. + + References + ---------- + .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and + Visualization", John Wiley & Sons, New York, Chicester, 1992. + .. [2] B.W. Silverman, "Density Estimation for Statistics and Data + Analysis", Vol. 26, Monographs on Statistics and Applied Probability, + Chapman and Hall, London, 1986. + .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A + Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993. + .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel + conditional density estimation", Computational Statistics & Data + Analysis, Vol. 36, pp. 279-298, 2001. + + Examples + -------- + Generate some random two-dimensional data: + + >>> from scipy import stats + >>> def measure(n): + >>> "Measurement model, return two coupled measurements." 
+ >>> m1 = np.random.normal(size=n) + >>> m2 = np.random.normal(scale=0.5, size=n) + >>> return m1+m2, m1-m2 + + >>> m1, m2 = measure(2000) + >>> xmin = m1.min() + >>> xmax = m1.max() + >>> ymin = m2.min() + >>> ymax = m2.max() + + Perform a kernel density estimate on the data: + + >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] + >>> positions = np.vstack([X.ravel(), Y.ravel()]) + >>> values = np.vstack([m1, m2]) + >>> kernel = stats.gaussian_kde(values) + >>> Z = np.reshape(kernel(positions).T, X.shape) + + Plot the results: + + >>> import matplotlib.pyplot as plt + >>> fig = plt.figure() + >>> ax = fig.add_subplot(111) + >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r, + ... extent=[xmin, xmax, ymin, ymax]) + >>> ax.plot(m1, m2, 'k.', markersize=2) + >>> ax.set_xlim([xmin, xmax]) + >>> ax.set_ylim([ymin, ymax]) + >>> plt.show() + + """ + def __init__(self, dataset, bw_method=None): + self.dataset = atleast_2d(dataset) + if not self.dataset.size > 1: + raise ValueError("`dataset` input should have multiple elements.") + + self.d, self.n = self.dataset.shape + self.set_bandwidth(bw_method=bw_method) + + def evaluate(self, points): + """Evaluate the estimated pdf on a set of points. + + Parameters + ---------- + points : (# of dimensions, # of points)-array + Alternatively, a (# of dimensions,) vector can be passed in and + treated as a single point. + + Returns + ------- + values : (# of points,)-array + The values at each point. + + Raises + ------ + ValueError : if the dimensionality of the input points is different than + the dimensionality of the KDE. + + """ + points = atleast_2d(points) + + d, m = points.shape + if d != self.d: + if d == 1 and m == self.d: + # points was passed in as a row vector + points = reshape(points, (self.d, 1)) + m = 1 + else: + msg = "points have dimension %s, dataset has dimension %s" % (d, + self.d) + raise ValueError(msg) + + result = zeros((m,), dtype=np.float) + + if m >= self.n: + # there are more points than data, so loop over data + for i in range(self.n): + diff = self.dataset[:, i, newaxis] - points + tdiff = dot(self.inv_cov, diff) + energy = sum(diff*tdiff,axis=0) / 2.0 + result = result + exp(-energy) + else: + # loop over points + for i in range(m): + diff = self.dataset - points[:, i, newaxis] + tdiff = dot(self.inv_cov, diff) + energy = sum(diff * tdiff, axis=0) / 2.0 + result[i] = sum(exp(-energy), axis=0) + + result = result / self._norm_factor + + return result + + __call__ = evaluate + + def integrate_gaussian(self, mean, cov): + """ + Multiply estimated density by a multivariate Gaussian and integrate + over the whole space. + + Parameters + ---------- + mean : aray_like + A 1-D array, specifying the mean of the Gaussian. + cov : array_like + A 2-D array, specifying the covariance matrix of the Gaussian. + + Returns + ------- + result : scalar + The value of the integral. + + Raises + ------ + ValueError : + If the mean or covariance of the input Gaussian differs from + the KDE's dimensionality. 
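# Minimal usage sketch for integrate_gaussian, assuming the scipy.stats
# gaussian_kde interface this file mirrors: integrate the estimated density
# against a unit Gaussian centred at zero.
import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
kde = stats.gaussian_kde(rng.normal(size=500))
val = kde.integrate_gaussian(mean=[0.0], cov=[[1.0]])
# For standard-normal data this is roughly 1/(2*sqrt(pi)) ~ 0.28, the integral
# of the product of two unit Gaussians.
print(val)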
+ + """ + mean = atleast_1d(squeeze(mean)) + cov = atleast_2d(cov) + + if mean.shape != (self.d,): + raise ValueError("mean does not have dimension %s" % self.d) + if cov.shape != (self.d, self.d): + raise ValueError("covariance does not have dimension %s" % self.d) + + # make mean a column vector + mean = mean[:, newaxis] + + sum_cov = self.covariance + cov + + diff = self.dataset - mean + tdiff = dot(linalg.inv(sum_cov), diff) + + energies = sum(diff * tdiff, axis=0) / 2.0 + result = sum(exp(-energies), axis=0) / sqrt(linalg.det(2 * pi * + sum_cov)) / self.n + + return result + + def integrate_box_1d(self, low, high): + """ + Computes the integral of a 1D pdf between two bounds. + + Parameters + ---------- + low : scalar + Lower bound of integration. + high : scalar + Upper bound of integration. + + Returns + ------- + value : scalar + The result of the integral. + + Raises + ------ + ValueError + If the KDE is over more than one dimension. + + """ + if self.d != 1: + raise ValueError("integrate_box_1d() only handles 1D pdfs") + + stdev = ravel(sqrt(self.covariance))[0] + + normalized_low = ravel((low - self.dataset) / stdev) + normalized_high = ravel((high - self.dataset) / stdev) + + value = np.mean(special.ndtr(normalized_high) - + special.ndtr(normalized_low)) + return value + + def integrate_box(self, low_bounds, high_bounds, maxpts=None): + """Computes the integral of a pdf over a rectangular interval. + + Parameters + ---------- + low_bounds : array_like + A 1-D array containing the lower bounds of integration. + high_bounds : array_like + A 1-D array containing the upper bounds of integration. + maxpts : int, optional + The maximum number of points to use for integration. + + Returns + ------- + value : scalar + The result of the integral. + + """ + if maxpts is not None: + extra_kwds = {'maxpts': maxpts} + else: + extra_kwds = {} + + value, inform = mvn.mvnun(low_bounds, high_bounds, self.dataset, + self.covariance, **extra_kwds) + if inform: + msg = ('An integral in mvn.mvnun requires more points than %s' % + (self.d * 1000)) + warnings.warn(msg) + + return value + + def integrate_kde(self, other): + """ + Computes the integral of the product of this kernel density estimate + with another. + + Parameters + ---------- + other : gaussian_kde instance + The other kde. + + Returns + ------- + value : scalar + The result of the integral. + + Raises + ------ + ValueError + If the KDEs have different dimensionality. + + """ + if other.d != self.d: + raise ValueError("KDEs are not the same dimensionality") + + # we want to iterate over the smallest number of points + if other.n < self.n: + small = other + large = self + else: + small = self + large = other + + sum_cov = small.covariance + large.covariance + result = 0.0 + for i in range(small.n): + mean = small.dataset[:, i, newaxis] + diff = large.dataset - mean + tdiff = dot(linalg.inv(sum_cov), diff) + + energies = sum(diff * tdiff, axis=0) / 2.0 + result += sum(exp(-energies), axis=0) + + result /= sqrt(linalg.det(2 * pi * sum_cov)) * large.n * small.n + + return result + + def resample(self, size=None): + """ + Randomly sample a dataset from the estimated pdf. + + Parameters + ---------- + size : int, optional + The number of samples to draw. If not provided, then the size is + the same as the underlying dataset. + + Returns + ------- + resample : (self.d, `size`) ndarray + The sampled dataset. 
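# Minimal resample sketch (assuming the scipy.stats gaussian_kde interface
# this file mirrors): points drawn from the estimated density come back with
# shape (d, size).
import numpy as np
from scipy import stats

rng = np.random.RandomState(1)
kde = stats.gaussian_kde(rng.normal(size=(2, 300)))  # 2-D dataset
new_points = kde.resample(size=500)
print(new_points.shape)  # (2, 500)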
+ + """ + if size is None: + size = self.n + + norm = transpose(multivariate_normal(zeros((self.d,), float), + self.covariance, size=size)) + indices = randint(0, self.n, size=size) + means = self.dataset[:, indices] + + return means + norm + + def scotts_factor(self): + return power(self.n, -1./(self.d+4)) + + def silverman_factor(self): + return power(self.n*(self.d+2.0)/4.0, -1./(self.d+4)) + + # Default method to calculate bandwidth, can be overwritten by subclass + covariance_factor = scotts_factor + + def set_bandwidth(self, bw_method=None): + """Compute the estimator bandwidth with given method. + + The new bandwidth calculated after a call to `set_bandwidth` is used + for subsequent evaluations of the estimated density. + + Parameters + ---------- + bw_method : str, scalar or callable, optional + The method used to calculate the estimator bandwidth. This can be + 'scott', 'silverman', a scalar constant or a callable. If a + scalar, this will be used directly as `kde.factor`. If a callable, + it should take a `gaussian_kde` instance as only parameter and + return a scalar. If None (default), nothing happens; the current + `kde.covariance_factor` method is kept. + + Notes + ----- + .. versionadded:: 0.11 + + Examples + -------- + >>> x1 = np.array([-7, -5, 1, 4, 5.]) + >>> kde = stats.gaussian_kde(x1) + >>> xs = np.linspace(-10, 10, num=50) + >>> y1 = kde(xs) + >>> kde.set_bandwidth(bw_method='silverman') + >>> y2 = kde(xs) + >>> kde.set_bandwidth(bw_method=kde.factor / 3.) + >>> y3 = kde(xs) + + >>> fig = plt.figure() + >>> ax = fig.add_subplot(111) + >>> ax.plot(x1, np.ones(x1.shape) / (4. * x1.size), 'bo', + ... label='Data points (rescaled)') + >>> ax.plot(xs, y1, label='Scott (default)') + >>> ax.plot(xs, y2, label='Silverman') + >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)') + >>> ax.legend() + >>> plt.show() + + """ + if bw_method is None: + pass + elif bw_method == 'scott': + self.covariance_factor = self.scotts_factor + elif bw_method == 'silverman': + self.covariance_factor = self.silverman_factor + elif np.isscalar(bw_method) and not isinstance(bw_method, string_types): + self._bw_method = 'use constant' + self.covariance_factor = lambda: bw_method + elif callable(bw_method): + self._bw_method = bw_method + self.covariance_factor = lambda: self._bw_method(self) + else: + msg = "`bw_method` should be 'scott', 'silverman', a scalar " \ + "or a callable." + raise ValueError(msg) + + self._compute_covariance() + + def _compute_covariance(self): + """Computes the covariance matrix for each Gaussian kernel using + covariance_factor(). + """ + self.factor = self.covariance_factor() + # Cache covariance and inverse covariance of the data + if not hasattr(self, '_data_inv_cov'): + self._data_covariance = atleast_2d(np.cov(self.dataset, rowvar=1, + bias=False)) + self._data_inv_cov = linalg.inv(self._data_covariance) + + self.covariance = self._data_covariance * self.factor**2 + self.inv_cov = self._data_inv_cov / self.factor**2 + self._norm_factor = sqrt(linalg.det(2*pi*self.covariance)) * self.n diff --git a/pywafo/src/wafo/stats/morestats.py b/pywafo/src/wafo/stats/morestats.py new file mode 100644 index 0000000..59c432f --- /dev/null +++ b/pywafo/src/wafo/stats/morestats.py @@ -0,0 +1,1943 @@ +# Author: Travis Oliphant, 2002 +# +# Further updates and enhancements by many SciPy developers. 
+# +from __future__ import division, print_function, absolute_import + +import math +import warnings + +import numpy as np +from numpy import (isscalar, r_, log, sum, around, unique, asarray, + zeros, arange, sort, amin, amax, any, atleast_1d, sqrt, ceil, + floor, array, poly1d, compress, not_equal, pi, exp, ravel, angle) +from numpy.testing.decorators import setastest + +from scipy.lib.six import string_types +from scipy import optimize +from scipy import special +from . import statlib +from . import stats +from .stats import find_repeats +from . import distributions +from ._distn_infrastructure import rv_generic + + +__all__ = ['mvsdist', + 'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot', + 'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot', + 'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test', + 'fligner', 'mood', 'wilcoxon', + 'pdf_fromgamma', 'circmean', 'circvar', 'circstd', + ] + + +def bayes_mvs(data, alpha=0.90): + """ + Bayesian confidence intervals for the mean, var, and std. + + Parameters + ---------- + data : array_like + Input data, if multi-dimensional it is flattened to 1-D by `bayes_mvs`. + Requires 2 or more data points. + alpha : float, optional + Probability that the returned confidence interval contains + the true parameter. + + Returns + ------- + mean_cntr, var_cntr, std_cntr : tuple + The three results are for the mean, variance and standard deviation, + respectively. Each result is a tuple of the form:: + + (center, (lower, upper)) + + with `center` the mean of the conditional pdf of the value given the + data, and `(lower, upper)` a confidence interval, centered on the + median, containing the estimate to a probability `alpha`. + + Notes + ----- + Each tuple of mean, variance, and standard deviation estimates represent + the (center, (lower, upper)) with center the mean of the conditional pdf + of the value given the data and (lower, upper) is a confidence interval + centered on the median, containing the estimate to a probability + `alpha`. + + Converts data to 1-D and assumes all data has the same mean and variance. + Uses Jeffrey's prior for variance and std. + + Equivalent to tuple((x.mean(), x.interval(alpha)) for x in mvsdist(dat)) + + References + ---------- + T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and + standard-deviation from data", http://hdl.handle.net/1877/438, 2006. + + """ + res = mvsdist(data) + if alpha >= 1 or alpha <= 0: + raise ValueError("0 < alpha < 1 is required, but alpha=%s was given." % alpha) + return tuple((x.mean(), x.interval(alpha)) for x in res) + + +def mvsdist(data): + """ + 'Frozen' distributions for mean, variance, and standard deviation of data. + + Parameters + ---------- + data : array_like + Input array. Converted to 1-D using ravel. + Requires 2 or more data-points. + + Returns + ------- + mdist : "frozen" distribution object + Distribution object representing the mean of the data + vdist : "frozen" distribution object + Distribution object representing the variance of the data + sdist : "frozen" distribution object + Distribution object representing the standard deviation of the data + + Notes + ----- + The return values from bayes_mvs(data) is equivalent to + ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``. + + In other words, calling ``.mean()`` and ``.interval(0.90)`` + on the three distribution objects returned from this function will give + the same results that are returned from `bayes_mvs`. 
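# Sketch of the equivalence stated above, using scipy.stats (assumed to match
# the copy in this patch): bayes_mvs(data, alpha) reproduces
# tuple((d.mean(), d.interval(alpha)) for d in mvsdist(data)).
from scipy import stats

data = [6, 9, 12, 7, 8, 8, 13]
mean_d, var_d, std_d = stats.mvsdist(data)
manual = tuple((d.mean(), d.interval(0.90)) for d in (mean_d, var_d, std_d))
print(manual)
print(stats.bayes_mvs(data, alpha=0.90))  # same centres and intervals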
+ + Examples + -------- + >>> from scipy.stats import mvsdist + >>> data = [6, 9, 12, 7, 8, 8, 13] + >>> mean, var, std = mvsdist(data) + + We now have frozen distribution objects "mean", "var" and "std" that we can + examine: + + >>> mean.mean() + 9.0 + >>> mean.interval(0.95) + (6.6120585482655692, 11.387941451734431) + >>> mean.std() + 1.1952286093343936 + + """ + x = ravel(data) + n = len(x) + if (n < 2): + raise ValueError("Need at least 2 data-points.") + xbar = x.mean() + C = x.var() + if (n > 1000): # gaussian approximations for large n + mdist = distributions.norm(loc=xbar, scale=math.sqrt(C/n)) + sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C/(2.*n))) + vdist = distributions.norm(loc=C, scale=math.sqrt(2.0/n)*C) + else: + nm1 = n-1 + fac = n*C/2. + val = nm1/2. + mdist = distributions.t(nm1,loc=xbar,scale=math.sqrt(C/nm1)) + sdist = distributions.gengamma(val,-2,scale=math.sqrt(fac)) + vdist = distributions.invgamma(val,scale=fac) + return mdist, vdist, sdist + + +def kstat(data,n=2): + """ + Return the nth k-statistic (1<=n<=4 so far). + + The nth k-statistic is the unique symmetric unbiased estimator of the nth + cumulant kappa_n. + + Parameters + ---------- + data : array_like + Input array. + n : int, {1, 2, 3, 4}, optional + Default is equal to 2. + + Returns + ------- + kstat : float + The nth k-statistic. + + See Also + -------- + kstatvar: Returns an unbiased estimator of the variance of the k-statistic. + + Notes + ----- + The cumulants are related to central moments but are specifically defined + using a power series expansion of the logarithm of the characteristic + function (which is the Fourier transform of the PDF). + In particular let phi(t) be the characteristic function, then:: + + ln phi(t) = > kappa_n (it)^n / n! (sum from n=0 to inf) + + The first few cumulants (kappa_n) in terms of central moments (mu_n) are:: + + kappa_1 = mu_1 + kappa_2 = mu_2 + kappa_3 = mu_3 + kappa_4 = mu_4 - 3*mu_2**2 + kappa_5 = mu_5 - 10*mu_2 * mu_3 + + References + ---------- + http://mathworld.wolfram.com/k-Statistic.html + + http://mathworld.wolfram.com/Cumulant.html + + """ + if n > 4 or n < 1: + raise ValueError("k-statistics only supported for 1<=n<=4") + n = int(n) + S = zeros(n+1,'d') + data = ravel(data) + N = len(data) + for k in range(1,n+1): + S[k] = sum(data**k,axis=0) + if n == 1: + return S[1]*1.0/N + elif n == 2: + return (N*S[2]-S[1]**2.0)/(N*(N-1.0)) + elif n == 3: + return (2*S[1]**3 - 3*N*S[1]*S[2]+N*N*S[3]) / (N*(N-1.0)*(N-2.0)) + elif n == 4: + return (-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 - + 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) / \ + (N*(N-1.0)*(N-2.0)*(N-3.0)) + else: + raise ValueError("Should not be here.") + + +def kstatvar(data,n=2): + """ + Returns an unbiased estimator of the variance of the k-statistic. + + See `kstat` for more details of the k-statistic. + + Parameters + ---------- + data : array_like + Input array. + n : int, {1, 2}, optional + Default is equal to 2. + + Returns + ------- + kstatvar : float + The nth k-statistic variance. 
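# Quick numeric check of the k-statistics described above (scipy.stats
# assumed): k1 is the sample mean and k2 the unbiased sample variance.
import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
data = rng.normal(size=50)
print(stats.kstat(data, 1), data.mean())       # equal
print(stats.kstat(data, 2), data.var(ddof=1))  # equal
print(stats.kstatvar(data, 2))                 # estimated variance of k2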
+ + See Also + -------- + kstat + + """ + data = ravel(data) + N = len(data) + if n == 1: + return kstat(data,n=2)*1.0/N + elif n == 2: + k2 = kstat(data,n=2) + k4 = kstat(data,n=4) + return (2*k2*k2*N + (N-1)*k4)/(N*(N+1)) + else: + raise ValueError("Only n=1 or n=2 supported.") + + +def _calc_uniform_order_statistic_medians(x): + """See Notes section of `probplot` for details.""" + N = len(x) + osm_uniform = np.zeros(N, dtype=np.float64) + osm_uniform[-1] = 0.5**(1.0 / N) + osm_uniform[0] = 1 - osm_uniform[-1] + i = np.arange(2, N) + osm_uniform[1:-1] = (i - 0.3175) / (N + 0.365) + return osm_uniform + + +def _parse_dist_kw(dist, enforce_subclass=True): + """Parse `dist` keyword. + + Parameters + ---------- + dist : str or stats.distributions instance. + Several functions take `dist` as a keyword, hence this utility + function. + enforce_subclass : bool, optional + If True (default), `dist` needs to be a + `_distn_infrastructure.rv_generic` instance. + It can sometimes be useful to set this keyword to False, if a function + wants to accept objects that just look somewhat like such an instance + (for example, they have a ``ppf`` method). + + """ + if isinstance(dist, rv_generic): + pass + elif isinstance(dist, string_types): + try: + dist = getattr(distributions, dist) + except AttributeError: + raise ValueError("%s is not a valid distribution name" % dist) + elif enforce_subclass: + msg = ("`dist` should be a stats.distributions instance or a string " + "with the name of such a distribution.") + raise ValueError(msg) + + return dist + + +def probplot(x, sparams=(), dist='norm', fit=True, plot=None): + """ + Calculate quantiles for a probability plot, and optionally show the plot. + + Generates a probability plot of sample data against the quantiles of a + specified theoretical distribution (the normal distribution by default). + `probplot` optionally calculates a best-fit line for the data and plots the + results using Matplotlib or a given plot function. + + Parameters + ---------- + x : array_like + Sample/response data from which `probplot` creates the plot. + sparams : tuple, optional + Distribution-specific shape parameters (shape parameters plus location + and scale). + dist : str or stats.distributions instance, optional + Distribution or distribution function name. The default is 'norm' for a + normal probability plot. Objects that look enough like a + stats.distributions instance (i.e. they have a ``ppf`` method) are also + accepted. + fit : bool, optional + Fit a least-squares regression (best-fit) line to the sample data if + True (default). + plot : object, optional + If given, plots the quantiles and least squares fit. + `plot` is an object that has to have methods "plot" and "text". + The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, + or a custom object with the same methods. + Default is None, which means that no plot is created. + + Returns + ------- + (osm, osr) : tuple of ndarrays + Tuple of theoretical quantiles (osm, or order statistic medians) and + ordered responses (osr). `osr` is simply sorted input `x`. + For details on how `osm` is calculated see the Notes section. + (slope, intercept, r) : tuple of floats, optional + Tuple containing the result of the least-squares fit, if that is + performed by `probplot`. `r` is the square root of the coefficient of + determination. If ``fit=False`` and ``plot=None``, this tuple is not + returned. 
+ + Notes + ----- + Even if `plot` is given, the figure is not shown or saved by `probplot`; + ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after + calling `probplot`. + + `probplot` generates a probability plot, which should not be confused with + a Q-Q or a P-P plot. Statsmodels has more extensive functionality of this + type, see ``statsmodels.api.ProbPlot``. + + The formula used for the theoretical quantiles (horizontal axis of the + probability plot) is Filliben's estimate:: + + quantiles = dist.ppf(val), for + + 0.5**(1/n), for i = n + val = (i - 0.3175) / (n + 0.365), for i = 2, ..., n-1 + 1 - 0.5**(1/n), for i = 1 + + where ``i`` indicates the i-th ordered value and ``n`` is the total number + of values. + + Examples + -------- + >>> from scipy import stats + >>> import matplotlib.pyplot as plt + >>> nsample = 100 + >>> np.random.seed(7654321) + + A t distribution with small degrees of freedom: + + >>> ax1 = plt.subplot(221) + >>> x = stats.t.rvs(3, size=nsample) + >>> res = stats.probplot(x, plot=plt) + + A t distribution with larger degrees of freedom: + + >>> ax2 = plt.subplot(222) + >>> x = stats.t.rvs(25, size=nsample) + >>> res = stats.probplot(x, plot=plt) + + A mixture of two normal distributions with broadcasting: + + >>> ax3 = plt.subplot(223) + >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5], + ... size=(nsample/2.,2)).ravel() + >>> res = stats.probplot(x, plot=plt) + + A standard normal distribution: + + >>> ax4 = plt.subplot(224) + >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample) + >>> res = stats.probplot(x, plot=plt) + + Produce a new figure with a loggamma distribution, using the ``dist`` and + ``sparams`` keywords: + + >>> fig = plt.figure() + >>> ax = fig.add_subplot(111) + >>> x = stats.loggamma.rvs(c=2.5, size=500) + >>> stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax) + >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5") + + Show the results with Matplotlib: + + >>> plt.show() + + """ + x = np.asarray(x) + osm_uniform = _calc_uniform_order_statistic_medians(x) + dist = _parse_dist_kw(dist, enforce_subclass=False) + if sparams is None: + sparams = () + if isscalar(sparams): + sparams = (sparams,) + if not isinstance(sparams, tuple): + sparams = tuple(sparams) + + osm = dist.ppf(osm_uniform, *sparams) + osr = sort(x) + if fit or (plot is not None): + # perform a linear fit. + slope, intercept, r, prob, sterrest = stats.linregress(osm, osr) + + if plot is not None: + plot.plot(osm, osr, 'bo', osm, slope*osm + intercept, 'r-') + try: + if hasattr(plot, 'set_title'): + # Matplotlib Axes instance or something that looks like it + plot.set_title('Probability Plot') + plot.set_xlabel('Quantiles') + plot.set_ylabel('Ordered Values') + else: + # matplotlib.pyplot module + plot.title('Probability Plot') + plot.xlabel('Quantiles') + plot.ylabel('Ordered Values') + except: + # Not an MPL object or something that looks (enough) like it. + # Don't crash on adding labels or title + pass + + # Add R^2 value to the plot as text + xmin = amin(osm) + xmax = amax(osm) + ymin = amin(x) + ymax = amax(x) + posx = xmin + 0.70 * (xmax - xmin) + posy = ymin + 0.01 * (ymax - ymin) + plot.text(posx, posy, "$R^2=%1.4f$" % r) + + if fit: + return (osm, osr), (slope, intercept, r) + else: + return osm, osr + + +def ppcc_max(x, brack=(0.0,1.0), dist='tukeylambda'): + """Returns the shape parameter that maximizes the probability plot + correlation coefficient for the given data to a one-parameter + family of distributions. 
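# Hypothetical usage sketch for ppcc_max (scipy.stats assumed): recover the
# Tukey-lambda shape parameter that best straightens the probability plot.
from scipy import stats

x = stats.tukeylambda.rvs(0.5, size=1000, random_state=2)
print(stats.ppcc_max(x))  # close to 0.5 for this sample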
+
+    See also ppcc_plot
+    """
+    dist = _parse_dist_kw(dist)
+    osm_uniform = _calc_uniform_order_statistic_medians(x)
+    osr = sort(x)
+
+    # this function computes the x-axis values of the probability plot
+    # and computes a linear regression (including the correlation)
+    # and returns 1-r so that a minimization function maximizes the
+    # correlation
+    def tempfunc(shape, mi, yvals, func):
+        xvals = func(mi, shape)
+        r, prob = stats.pearsonr(xvals, yvals)
+        return 1 - r
+
+    return optimize.brent(tempfunc, brack=brack, args=(osm_uniform, osr, dist.ppf))
+
+
+def ppcc_plot(x, a, b, dist='tukeylambda', plot=None, N=80):
+    """Returns (shape, ppcc), and optionally plots shape vs. ppcc
+    (probability plot correlation coefficient) as a function of shape
+    parameter for a one-parameter family of distributions from shape
+    value a to b.
+
+    See also ppcc_max
+    """
+    svals = r_[a:b:complex(N)]
+    ppcc = svals * 0.0
+    k = 0
+    for sval in svals:
+        r1, r2 = probplot(x, sval, dist=dist, fit=True)
+        ppcc[k] = r2[-1]
+        k += 1
+    if plot is not None:
+        plot.plot(svals, ppcc, 'x')
+        plot.title('(%s) PPCC Plot' % dist)
+        # shape values run along the horizontal axis, the correlation
+        # coefficient along the vertical axis
+        plot.xlabel('Shape Values')
+        plot.ylabel('Prob Plot Corr. Coef.')
+    return svals, ppcc
+
+
+def boxcox_llf(lmb, data):
+    r"""The boxcox log-likelihood function.
+
+    Parameters
+    ----------
+    lmb : scalar
+        Parameter for Box-Cox transformation.  See `boxcox` for details.
+    data : array_like
+        Data to calculate Box-Cox log-likelihood for.  If `data` is
+        multi-dimensional, the log-likelihood is calculated along the first
+        axis.
+
+    Returns
+    -------
+    llf : float or ndarray
+        Box-Cox log-likelihood of `data` given `lmb`.  A float for 1-D `data`,
+        an array otherwise.
+
+    See Also
+    --------
+    boxcox, probplot, boxcox_normplot, boxcox_normmax
+
+    Notes
+    -----
+    The Box-Cox log-likelihood function is defined here as
+
+    .. math::
+
+        llf = (\lambda - 1) \sum_i(\log(x_i)) -
+              N/2 \log(\sum_i (y_i - \bar{y})^2 / N),
+
+    where ``y`` is the Box-Cox transformed input data ``x``.
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+    >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
+    >>> np.random.seed(1245)
+
+    Generate some random variates and calculate Box-Cox log-likelihood values
+    for them for a range of ``lmbda`` values:
+
+    >>> x = stats.loggamma.rvs(5, loc=10, size=1000)
+    >>> lmbdas = np.linspace(-2, 10)
+    >>> llf = np.zeros(lmbdas.shape, dtype=float)
+    >>> for ii, lmbda in enumerate(lmbdas):
+    ...     llf[ii] = stats.boxcox_llf(lmbda, x)
+
+    Also find the optimal lmbda value with `boxcox`:
+
+    >>> x_most_normal, lmbda_optimal = stats.boxcox(x)
+
+    Plot the log-likelihood as function of lmbda.  Add the optimal lmbda as a
+    horizontal line to check that that's really the optimum:
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> ax.plot(lmbdas, llf, 'b.-')
+    >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r')
+    >>> ax.set_xlabel('lmbda parameter')
+    >>> ax.set_ylabel('Box-Cox log-likelihood')
+
+    Now add some probability plots to show that where the log-likelihood is
+    maximized the data transformed with `boxcox` looks closest to normal:
+
+    >>> locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
+    >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
+    ...     xt = stats.boxcox(x, lmbda=lmbda)
+    ...     (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
+    ...     ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
+    ...     ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
+    ...
ax_inset.set_xticklabels([]) + ... ax_inset.set_yticklabels([]) + ... ax_inset.set_title('$\lambda=%1.2f$' % lmbda) + + >>> plt.show() + + """ + data = np.asarray(data) + N = data.shape[0] + if N == 0: + return np.nan + + y = boxcox(data, lmb) + y_mean = np.mean(y, axis=0) + llf = (lmb - 1) * np.sum(np.log(data), axis=0) + llf -= N / 2.0 * np.log(np.sum((y - y_mean)**2. / N, axis=0)) + return llf + + +def _boxcox_conf_interval(x, lmax, alpha): + # Need to find the lambda for which + # f(x,lmbda) >= f(x,lmax) - 0.5*chi^2_alpha;1 + fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1) + target = boxcox_llf(lmax, x) - fac + + def rootfunc(lmbda, data, target): + return boxcox_llf(lmbda, data) - target + + # Find positive endpoint of interval in which answer is to be found + newlm = lmax + 0.5 + N = 0 + while (rootfunc(newlm, x, target) > 0.0) and (N < 500): + newlm += 0.1 + N += 1 + + if N == 500: + raise RuntimeError("Could not find endpoint.") + + lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target)) + + # Now find negative interval in the same way + newlm = lmax - 0.5 + N = 0 + while (rootfunc(newlm, x, target) > 0.0) and (N < 500): + newlm -= 0.1 + N += 1 + + if N == 500: + raise RuntimeError("Could not find endpoint.") + + lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target)) + return lmminus, lmplus + + +def boxcox(x, lmbda=None, alpha=None): + r""" + Return a positive dataset transformed by a Box-Cox power transformation. + + Parameters + ---------- + x : ndarray + Input array. Should be 1-dimensional. + lmbda : {None, scalar}, optional + If `lmbda` is not None, do the transformation for that value. + + If `lmbda` is None, find the lambda that maximizes the log-likelihood + function and return it as the second output argument. + alpha : {None, float}, optional + If `alpha` is not None, return the ``100 * (1-alpha)%`` confidence + interval for `lmbda` as the third output argument. + Must be between 0.0 and 1.0. + + Returns + ------- + boxcox : ndarray + Box-Cox power transformed array. + maxlog : float, optional + If the `lmbda` parameter is None, the second returned argument is + the lambda that maximizes the log-likelihood function. + (min_ci, max_ci) : tuple of float, optional + If `lmbda` parameter is None and `alpha` is not None, this returned + tuple of floats represents the minimum and maximum confidence limits + given `alpha`. + + See Also + -------- + probplot, boxcox_normplot, boxcox_normmax, boxcox_llf + + Notes + ----- + The Box-Cox transform is given by:: + + y = (x**lmbda - 1) / lmbda, for lmbda > 0 + log(x), for lmbda = 0 + + `boxcox` requires the input data to be positive. Sometimes a Box-Cox + transformation provides a shift parameter to achieve this; `boxcox` does + not. Such a shift parameter is equivalent to adding a positive constant to + `x` before calling `boxcox`. + + The confidence limits returned when `alpha` is provided give the interval + where: + + .. math:: + + llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1), + + with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared + function. + + References + ---------- + G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the + Royal Statistical Society B, 26, 211-252 (1964). 
+ + Examples + -------- + >>> from scipy import stats + >>> import matplotlib.pyplot as plt + + We generate some random variates from a non-normal distribution and make a + probability plot for it, to show it is non-normal in the tails: + + >>> fig = plt.figure() + >>> ax1 = fig.add_subplot(211) + >>> x = stats.loggamma.rvs(5, size=500) + 5 + >>> stats.probplot(x, dist=stats.norm, plot=ax1) + >>> ax1.set_xlabel('') + >>> ax1.set_title('Probplot against normal distribution') + + We now use `boxcox` to transform the data so it's closest to normal: + + >>> ax2 = fig.add_subplot(212) + >>> xt, _ = stats.boxcox(x) + >>> stats.probplot(xt, dist=stats.norm, plot=ax2) + >>> ax2.set_title('Probplot after Box-Cox transformation') + + >>> plt.show() + + """ + x = np.asarray(x) + if x.size == 0: + return x + + if any(x <= 0): + raise ValueError("Data must be positive.") + + if lmbda is not None: # single transformation + return special.boxcox(x, lmbda) + + # If lmbda=None, find the lmbda that maximizes the log-likelihood function. + lmax = boxcox_normmax(x, method='mle') + y = boxcox(x, lmax) + + if alpha is None: + return y, lmax + else: + # Find confidence interval + interval = _boxcox_conf_interval(x, lmax, alpha) + return y, lmax, interval + + +def boxcox_normmax(x, brack=(-2.0, 2.0), method='pearsonr'): + """Compute optimal Box-Cox transform parameter for input data. + + Parameters + ---------- + x : array_like + Input array. + brack : 2-tuple, optional + The starting interval for a downhill bracket search with + `optimize.brent`. Note that this is in most cases not critical; the + final result is allowed to be outside this bracket. + method : str, optional + The method to determine the optimal transform parameter (`boxcox` + ``lmbda`` parameter). Options are: + + 'pearsonr' (default) + Maximizes the Pearson correlation coefficient between + ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be + normally-distributed. + + 'mle' + Minimizes the log-likelihood `boxcox_llf`. This is the method used + in `boxcox`. + + 'all' + Use all optimization methods available, and return all results. + Useful to compare different methods. + + Returns + ------- + maxlog : float or ndarray + The optimal transform parameter found. An array instead of a scalar + for ``method='all'``. + + See Also + -------- + boxcox, boxcox_llf, boxcox_normplot + + Examples + -------- + >>> from scipy import stats + >>> import matplotlib.pyplot as plt + >>> np.random.seed(1234) # make this example reproducible + + Generate some data and determine optimal ``lmbda`` in various ways: + + >>> x = stats.loggamma.rvs(5, size=30) + 5 + >>> y, lmax_mle = stats.boxcox(x) + >>> lmax_pearsonr = stats.boxcox_normmax(x) + + >>> lmax_mle + 7.177... + >>> lmax_pearsonr + 7.916... + >>> stats.boxcox_normmax(x, method='all') + array([ 7.91667384, 7.17718692]) + + >>> fig = plt.figure() + >>> ax = fig.add_subplot(111) + >>> stats.boxcox_normplot(x, -10, 10, plot=ax) + >>> ax.axvline(lmax_mle, color='r') + >>> ax.axvline(lmax_pearsonr, color='g', ls='--') + + >>> plt.show() + + """ + def _pearsonr(x, brack): + osm_uniform = _calc_uniform_order_statistic_medians(x) + xvals = distributions.norm.ppf(osm_uniform) + + def _eval_pearsonr(lmbda, xvals, samps): + # This function computes the x-axis values of the probability plot + # and computes a linear regression (including the correlation) and + # returns ``1 - r`` so that a minimization function maximizes the + # correlation. 
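+            # ``xvals`` are the expected normal order statistics (the uniform
+            # order statistic medians mapped through norm.ppf above); sorting
+            # the transformed sample below gives the matching empirical
+            # quantiles for the correlation.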
+ y = boxcox(samps, lmbda) + yvals = np.sort(y) + r, prob = stats.pearsonr(xvals, yvals) + return 1 - r + + return optimize.brent(_eval_pearsonr, brack=brack, args=(xvals, x)) + + def _mle(x, brack): + def _eval_mle(lmb, data): + # function to minimize + return -boxcox_llf(lmb, data) + + return optimize.brent(_eval_mle, brack=brack, args=(x,)) + + def _all(x, brack): + maxlog = np.zeros(2, dtype=np.float) + maxlog[0] = _pearsonr(x, brack) + maxlog[1] = _mle(x, brack) + return maxlog + + methods = {'pearsonr': _pearsonr, + 'mle': _mle, + 'all': _all} + if not method in methods.keys(): + raise ValueError("Method %s not recognized." % method) + + optimfunc = methods[method] + return optimfunc(x, brack) + + +def boxcox_normplot(x, la, lb, plot=None, N=80): + """Compute parameters for a Box-Cox normality plot, optionally show it. + + A Box-Cox normality plot shows graphically what the best transformation + parameter is to use in `boxcox` to obtain a distribution that is close + to normal. + + Parameters + ---------- + x : array_like + Input array. + la, lb : scalar + The lower and upper bounds for the ``lmbda`` values to pass to `boxcox` + for Box-Cox transformations. These are also the limits of the + horizontal axis of the plot if that is generated. + plot : object, optional + If given, plots the quantiles and least squares fit. + `plot` is an object that has to have methods "plot" and "text". + The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, + or a custom object with the same methods. + Default is None, which means that no plot is created. + N : int, optional + Number of points on the horizontal axis (equally distributed from + `la` to `lb`). + + Returns + ------- + lmbdas : ndarray + The ``lmbda`` values for which a Box-Cox transform was done. + ppcc : ndarray + Probability Plot Correlelation Coefficient, as obtained from `probplot` + when fitting the Box-Cox transformed input `x` against a normal + distribution. + + See Also + -------- + probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max + + Notes + ----- + Even if `plot` is given, the figure is not shown or saved by + `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')`` + should be used after calling `probplot`. + + Examples + -------- + >>> from scipy import stats + >>> import matplotlib.pyplot as plt + + Generate some non-normally distributed data, and create a Box-Cox plot: + + >>> x = stats.loggamma.rvs(5, size=500) + 5 + >>> fig = plt.figure() + >>> ax = fig.add_subplot(111) + >>> stats.boxcox_normplot(x, -20, 20, plot=ax) + + Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in + the same plot: + + >>> _, maxlog = stats.boxcox(x) + >>> ax.axvline(maxlog, color='r') + + >>> plt.show() + + """ + x = np.asarray(x) + if x.size == 0: + return x + + if lb <= la: + raise ValueError("`lb` has to be larger than `la`.") + + lmbdas = np.linspace(la, lb, num=N) + ppcc = lmbdas * 0.0 + for i, val in enumerate(lmbdas): + # Determine for each lmbda the correlation coefficient of transformed x + z = boxcox(x, lmbda=val) + _, r2 = probplot(z, dist='norm', fit=True) + ppcc[i] = r2[-1] + + if plot is not None: + plot.plot(lmbdas, ppcc, 'x') + try: + if hasattr(plot, 'set_title'): + # Matplotlib Axes instance or something that looks like it + plot.set_title('Box-Cox Normality Plot') + plot.set_ylabel('Prob Plot Corr. Coef.') + plot.set_xlabel('$\lambda$') + else: + # matplotlib.pyplot module + plot.title('Box-Cox Normality Plot') + plot.ylabel('Prob Plot Corr. 
Coef.') + plot.xlabel('$\lambda$') + except Exception: + # Not an MPL object or something that looks (enough) like it. + # Don't crash on adding labels or title + pass + + return lmbdas, ppcc + + +def shapiro(x, a=None, reta=False): + """ + Perform the Shapiro-Wilk test for normality. + + The Shapiro-Wilk test tests the null hypothesis that the + data was drawn from a normal distribution. + + Parameters + ---------- + x : array_like + Array of sample data. + a : array_like, optional + Array of internal parameters used in the calculation. If these + are not given, they will be computed internally. If x has length + n, then a must have length n/2. + reta : bool, optional + Whether or not to return the internally computed a values. The + default is False. + + Returns + ------- + W : float + The test statistic. + p-value : float + The p-value for the hypothesis test. + a : array_like, optional + If `reta` is True, then these are the internally computed "a" + values that may be passed into this function on future calls. + + See Also + -------- + anderson : The Anderson-Darling test for normality + + References + ---------- + .. [1] http://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm + + """ + N = len(x) + if N < 3: + raise ValueError("Data must be at least length 3.") + if a is None: + a = zeros(N,'f') + init = 0 + else: + if len(a) != N//2: + raise ValueError("len(a) must equal len(x)/2") + init = 1 + y = sort(x) + a, w, pw, ifault = statlib.swilk(y, a[:N//2], init) + if not ifault in [0,2]: + warnings.warn(str(ifault)) + if N > 5000: + warnings.warn("p-value may not be accurate for N > 5000.") + if reta: + return w, pw, a + else: + return w, pw + +# Values from Stephens, M A, "EDF Statistics for Goodness of Fit and +# Some Comparisons", Journal of he American Statistical +# Association, Vol. 69, Issue 347, Sept. 1974, pp 730-737 +_Avals_norm = array([0.576, 0.656, 0.787, 0.918, 1.092]) +_Avals_expon = array([0.922, 1.078, 1.341, 1.606, 1.957]) +# From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution", +# Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588. +_Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038]) +# From Stephens, M A, "Tests of Fit for the Logistic Distribution Based +# on the Empirical Distribution Function.", Biometrika, +# Vol. 66, Issue 3, Dec. 1979, pp 591-595. +_Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010]) + + +def anderson(x,dist='norm'): + """ + Anderson-Darling test for data coming from a particular distribution + + The Anderson-Darling test is a modification of the Kolmogorov- + Smirnov test kstest_ for the null hypothesis that a sample is + drawn from a population that follows a particular distribution. + For the Anderson-Darling test, the critical values depend on + which distribution is being tested against. This function works + for normal, exponential, logistic, or Gumbel (Extreme Value + Type I) distributions. + + Parameters + ---------- + x : array_like + array of sample data + dist : {'norm','expon','logistic','gumbel','extreme1'}, optional + the type of distribution to test against. The default is 'norm' + and 'extreme1' is a synonym for 'gumbel' + + Returns + ------- + A2 : float + The Anderson-Darling test statistic + critical : list + The critical values for this distribution + sig : list + The significance levels for the corresponding critical values + in percents. 
The function returns critical values for a + differing set of significance levels depending on the + distribution that is being tested against. + + Notes + ----- + Critical values provided are for the following significance levels: + + normal/exponenential + 15%, 10%, 5%, 2.5%, 1% + logistic + 25%, 10%, 5%, 2.5%, 1%, 0.5% + Gumbel + 25%, 10%, 5%, 2.5%, 1% + + If A2 is larger than these critical values then for the corresponding + significance level, the null hypothesis that the data come from the + chosen distribution can be rejected. + + References + ---------- + .. [1] http://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm + .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and + Some Comparisons, Journal of the American Statistical Association, + Vol. 69, pp. 730-737. + .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit + Statistics with Unknown Parameters, Annals of Statistics, Vol. 4, + pp. 357-369. + .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value + Distribution, Biometrika, Vol. 64, pp. 583-588. + .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference + to Tests for Exponentiality , Technical Report No. 262, + Department of Statistics, Stanford University, Stanford, CA. + .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution + Based on the Empirical Distribution Function, Biometrika, Vol. 66, + pp. 591-595. + + """ + if not dist in ['norm','expon','gumbel','extreme1','logistic']: + raise ValueError("Invalid distribution; dist must be 'norm', " + "'expon', 'gumbel', 'extreme1' or 'logistic'.") + y = sort(x) + xbar = np.mean(x, axis=0) + N = len(y) + if dist == 'norm': + s = np.std(x, ddof=1, axis=0) + w = (y-xbar)/s + z = distributions.norm.cdf(w) + sig = array([15,10,5,2.5,1]) + critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N),3) + elif dist == 'expon': + w = y / xbar + z = distributions.expon.cdf(w) + sig = array([15,10,5,2.5,1]) + critical = around(_Avals_expon / (1.0 + 0.6/N),3) + elif dist == 'logistic': + def rootfunc(ab,xj,N): + a,b = ab + tmp = (xj-a)/b + tmp2 = exp(tmp) + val = [sum(1.0/(1+tmp2),axis=0)-0.5*N, + sum(tmp*(1.0-tmp2)/(1+tmp2),axis=0)+N] + return array(val) + sol0 = array([xbar,np.std(x, ddof=1, axis=0)]) + sol = optimize.fsolve(rootfunc,sol0,args=(x,N),xtol=1e-5) + w = (y-sol[0])/sol[1] + z = distributions.logistic.cdf(w) + sig = array([25,10,5,2.5,1,0.5]) + critical = around(_Avals_logistic / (1.0+0.25/N),3) + else: # (dist == 'gumbel') or (dist == 'extreme1'): + # the following is incorrect, see ticket:1097 +## def fixedsolve(th,xj,N): +## val = stats.sum(xj)*1.0/N +## tmp = exp(-xj/th) +## term = sum(xj*tmp,axis=0) +## term /= sum(tmp,axis=0) +## return val - term +## s = optimize.fixed_point(fixedsolve, 1.0, args=(x,N),xtol=1e-5) +## xbar = -s*log(sum(exp(-x/s),axis=0)*1.0/N) + xbar, s = distributions.gumbel_l.fit(x) + w = (y-xbar)/s + z = distributions.gumbel_l.cdf(w) + sig = array([25,10,5,2.5,1]) + critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)),3) + + i = arange(1,N+1) + S = sum((2*i-1.0)/N*(log(z)+log(1-z[::-1])),axis=0) + A2 = -N-S + return A2, critical, sig + + +def ansari(x,y): + """ + Perform the Ansari-Bradley test for equal scale parameters + + The Ansari-Bradley test is a non-parametric test for the equality + of the scale parameter of the distributions from which two + samples were drawn. 
+ + Parameters + ---------- + x, y : array_like + arrays of sample data + + Returns + ------- + AB : float + The Ansari-Bradley test statistic + p-value : float + The p-value of the hypothesis test + + See Also + -------- + fligner : A non-parametric test for the equality of k variances + mood : A non-parametric test for the equality of two scale parameters + + Notes + ----- + The p-value given is exact when the sample sizes are both less than + 55 and there are no ties, otherwise a normal approximation for the + p-value is used. + + References + ---------- + .. [1] Sprent, Peter and N.C. Smeeton. Applied nonparametric statistical + methods. 3rd ed. Chapman and Hall/CRC. 2001. Section 5.8.2. + + """ + x,y = asarray(x),asarray(y) + n = len(x) + m = len(y) + if m < 1: + raise ValueError("Not enough other observations.") + if n < 1: + raise ValueError("Not enough test observations.") + N = m+n + xy = r_[x,y] # combine + rank = stats.rankdata(xy) + symrank = amin(array((rank,N-rank+1)),0) + AB = sum(symrank[:n],axis=0) + uxy = unique(xy) + repeats = (len(uxy) != len(xy)) + exact = ((m < 55) and (n < 55) and not repeats) + if repeats and ((m < 55) or (n < 55)): + warnings.warn("Ties preclude use of exact statistic.") + if exact: + astart, a1, ifault = statlib.gscale(n,m) + ind = AB-astart + total = sum(a1,axis=0) + if ind < len(a1)/2.0: + cind = int(ceil(ind)) + if (ind == cind): + pval = 2.0*sum(a1[:cind+1],axis=0)/total + else: + pval = 2.0*sum(a1[:cind],axis=0)/total + else: + find = int(floor(ind)) + if (ind == floor(ind)): + pval = 2.0*sum(a1[find:],axis=0)/total + else: + pval = 2.0*sum(a1[find+1:],axis=0)/total + return AB, min(1.0,pval) + + # otherwise compute normal approximation + if N % 2: # N odd + mnAB = n*(N+1.0)**2 / 4.0 / N + varAB = n*m*(N+1.0)*(3+N**2)/(48.0*N**2) + else: + mnAB = n*(N+2.0)/4.0 + varAB = m*n*(N+2)*(N-2.0)/48/(N-1.0) + if repeats: # adjust variance estimates + # compute sum(tj * rj**2,axis=0) + fac = sum(symrank**2,axis=0) + if N % 2: # N odd + varAB = m*n*(16*N*fac-(N+1)**4)/(16.0 * N**2 * (N-1)) + else: # N even + varAB = m*n*(16*fac-N*(N+2)**2)/(16.0 * N * (N-1)) + z = (AB - mnAB)/sqrt(varAB) + pval = distributions.norm.sf(abs(z)) * 2.0 + return AB, pval + + +def bartlett(*args): + """ + Perform Bartlett's test for equal variances + + Bartlett's test tests the null hypothesis that all input samples + are from populations with equal variances. For samples + from significantly non-normal populations, Levene's test + `levene`_ is more robust. + + Parameters + ---------- + sample1, sample2,... : array_like + arrays of sample data. May be different lengths. + + Returns + ------- + T : float + The test statistic. + p-value : float + The p-value of the test. + + References + ---------- + .. [1] http://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm + + .. [2] Snedecor, George W. and Cochran, William G. (1989), Statistical + Methods, Eighth Edition, Iowa State University Press. 
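+
+    Examples
+    --------
+    A minimal usage sketch (the data below are illustrative, not taken from
+    the references): draw two samples with clearly different spreads and
+    test them.
+
+    >>> from scipy import stats
+    >>> np.random.seed(12345678)
+    >>> a = np.random.normal(loc=0., scale=1., size=50)
+    >>> b = np.random.normal(loc=0., scale=2., size=50)
+    >>> T, p = stats.bartlett(a, b)   # p will be very small for these samples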
+ + """ + k = len(args) + if k < 2: + raise ValueError("Must enter at least two input sample vectors.") + Ni = zeros(k) + ssq = zeros(k,'d') + for j in range(k): + Ni[j] = len(args[j]) + ssq[j] = np.var(args[j], ddof=1) + Ntot = sum(Ni,axis=0) + spsq = sum((Ni-1)*ssq,axis=0)/(1.0*(Ntot-k)) + numer = (Ntot*1.0-k)*log(spsq) - sum((Ni-1.0)*log(ssq),axis=0) + denom = 1.0 + (1.0/(3*(k-1)))*((sum(1.0/(Ni-1.0),axis=0))-1.0/(Ntot-k)) + T = numer / denom + pval = distributions.chi2.sf(T,k-1) # 1 - cdf + return T, pval + + +def levene(*args,**kwds): + """ + Perform Levene test for equal variances. + + The Levene test tests the null hypothesis that all input samples + are from populations with equal variances. Levene's test is an + alternative to Bartlett's test `bartlett` in the case where + there are significant deviations from normality. + + Parameters + ---------- + sample1, sample2, ... : array_like + The sample data, possibly with different lengths + center : {'mean', 'median', 'trimmed'}, optional + Which function of the data to use in the test. The default + is 'median'. + proportiontocut : float, optional + When `center` is 'trimmed', this gives the proportion of data points + to cut from each end. (See `scipy.stats.trim_mean`.) + Default is 0.05. + + Returns + ------- + W : float + The test statistic. + p-value : float + The p-value for the test. + + Notes + ----- + Three variations of Levene's test are possible. The possibilities + and their recommended usages are: + + * 'median' : Recommended for skewed (non-normal) distributions> + * 'mean' : Recommended for symmetric, moderate-tailed distributions. + * 'trimmed' : Recommended for heavy-tailed distributions. + + References + ---------- + .. [1] http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm + .. [2] Levene, H. (1960). In Contributions to Probability and Statistics: + Essays in Honor of Harold Hotelling, I. Olkin et al. eds., + Stanford University Press, pp. 278-292. + .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American + Statistical Association, 69, 364-367 + + """ + # Handle keyword arguments. + center = 'median' + proportiontocut = 0.05 + for kw, value in kwds.items(): + if kw not in ['center', 'proportiontocut']: + raise TypeError("levene() got an unexpected keyword argument '%s'" % kw) + if kw == 'center': + center = value + else: + proportiontocut = value + + k = len(args) + if k < 2: + raise ValueError("Must enter at least two input sample vectors.") + Ni = zeros(k) + Yci = zeros(k,'d') + + if not center in ['mean','median','trimmed']: + raise ValueError("Keyword argument
must be 'mean', 'median'" + + "or 'trimmed'.") + + if center == 'median': + func = lambda x: np.median(x, axis=0) + elif center == 'mean': + func = lambda x: np.mean(x, axis=0) + else: # center == 'trimmed' + args = tuple(stats.trimboth(np.sort(arg), proportiontocut) for arg in args) + func = lambda x: np.mean(x, axis=0) + + for j in range(k): + Ni[j] = len(args[j]) + Yci[j] = func(args[j]) + Ntot = sum(Ni,axis=0) + + # compute Zij's + Zij = [None]*k + for i in range(k): + Zij[i] = abs(asarray(args[i])-Yci[i]) + # compute Zbari + Zbari = zeros(k,'d') + Zbar = 0.0 + for i in range(k): + Zbari[i] = np.mean(Zij[i], axis=0) + Zbar += Zbari[i]*Ni[i] + Zbar /= Ntot + + numer = (Ntot-k)*sum(Ni*(Zbari-Zbar)**2,axis=0) + + # compute denom_variance + dvar = 0.0 + for i in range(k): + dvar += sum((Zij[i]-Zbari[i])**2,axis=0) + + denom = (k-1.0)*dvar + + W = numer / denom + pval = distributions.f.sf(W,k-1,Ntot-k) # 1 - cdf + return W, pval + + +@setastest(False) +def binom_test(x,n=None,p=0.5): + """ + Perform a test that the probability of success is p. + + This is an exact, two-sided test of the null hypothesis + that the probability of success in a Bernoulli experiment + is `p`. + + Parameters + ---------- + x : integer or array_like + the number of successes, or if x has length 2, it is the + number of successes and the number of failures. + n : integer + the number of trials. This is ignored if x gives both the + number of successes and failures + p : float, optional + The hypothesized probability of success. 0 <= p <= 1. The + default value is p = 0.5 + + Returns + ------- + p-value : float + The p-value of the hypothesis test + + References + ---------- + .. [1] http://en.wikipedia.org/wiki/Binomial_test + + """ + x = atleast_1d(x).astype(np.integer) + if len(x) == 2: + n = x[1]+x[0] + x = x[0] + elif len(x) == 1: + x = x[0] + if n is None or n < x: + raise ValueError("n must be >= x") + n = np.int_(n) + else: + raise ValueError("Incorrect length for x.") + + if (p > 1.0) or (p < 0.0): + raise ValueError("p must be in range [0,1]") + + d = distributions.binom.pmf(x,n,p) + rerr = 1+1e-7 + if (x == p*n): + # special case as shortcut, would also be handled by `else` below + pval = 1. + elif (x < p*n): + i = np.arange(np.ceil(p*n),n+1) + y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0) + pval = distributions.binom.cdf(x,n,p) + distributions.binom.sf(n-y,n,p) + else: + i = np.arange(np.floor(p*n) + 1) + y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0) + pval = distributions.binom.cdf(y-1,n,p) + distributions.binom.sf(x-1,n,p) + + return min(1.0,pval) + + +def _apply_func(x,g,func): + # g is list of indices into x + # separating x into different groups + # func should be applied over the groups + g = unique(r_[0,g,len(x)]) + output = [] + for k in range(len(g)-1): + output.append(func(x[g[k]:g[k+1]])) + return asarray(output) + + +def fligner(*args,**kwds): + """ + Perform Fligner's test for equal variances. + + Fligner's test tests the null hypothesis that all input samples + are from populations with equal variances. Fligner's test is + non-parametric in contrast to Bartlett's test `bartlett` and + Levene's test `levene`. + + Parameters + ---------- + sample1, sample2, ... : array_like + arrays of sample data. Need not be the same length + center : {'mean', 'median', 'trimmed'}, optional + keyword argument controlling which function of the data + is used in computing the test statistic. The default + is 'median'. 
+ proportiontocut : float, optional + When `center` is 'trimmed', this gives the proportion of data points + to cut from each end. (See `scipy.stats.trim_mean`.) + Default is 0.05. + + Returns + ------- + Xsq : float + the test statistic + p-value : float + the p-value for the hypothesis test + + Notes + ----- + As with Levene's test there are three variants + of Fligner's test that differ by the measure of central + tendency used in the test. See `levene` for more information. + + References + ---------- + .. [1] http://www.stat.psu.edu/~bgl/center/tr/TR993.ps + + .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample + tests for scale. 'Journal of the American Statistical Association.' + 71(353), 210-213. + + """ + # Handle keyword arguments. + center = 'median' + proportiontocut = 0.05 + for kw, value in kwds.items(): + if kw not in ['center', 'proportiontocut']: + raise TypeError("fligner() got an unexpected keyword argument '%s'" % kw) + if kw == 'center': + center = value + else: + proportiontocut = value + + k = len(args) + if k < 2: + raise ValueError("Must enter at least two input sample vectors.") + + if not center in ['mean','median','trimmed']: + raise ValueError("Keyword argument
must be 'mean', 'median'" + + "or 'trimmed'.") + + if center == 'median': + func = lambda x: np.median(x, axis=0) + elif center == 'mean': + func = lambda x: np.mean(x, axis=0) + else: # center == 'trimmed' + args = tuple(stats.trimboth(arg, proportiontocut) for arg in args) + func = lambda x: np.mean(x, axis=0) + + Ni = asarray([len(args[j]) for j in range(k)]) + Yci = asarray([func(args[j]) for j in range(k)]) + Ntot = sum(Ni,axis=0) + # compute Zij's + Zij = [abs(asarray(args[i])-Yci[i]) for i in range(k)] + allZij = [] + g = [0] + for i in range(k): + allZij.extend(list(Zij[i])) + g.append(len(allZij)) + + ranks = stats.rankdata(allZij) + a = distributions.norm.ppf(ranks/(2*(Ntot+1.0)) + 0.5) + + # compute Aibar + Aibar = _apply_func(a,g,sum) / Ni + anbar = np.mean(a, axis=0) + varsq = np.var(a,axis=0, ddof=1) + Xsq = sum(Ni*(asarray(Aibar)-anbar)**2.0,axis=0)/varsq + pval = distributions.chi2.sf(Xsq,k-1) # 1 - cdf + return Xsq, pval + + +def mood(x, y, axis=0): + """ + Perform Mood's test for equal scale parameters. + + Mood's two-sample test for scale parameters is a non-parametric + test for the null hypothesis that two samples are drawn from the + same distribution with the same scale parameter. + + Parameters + ---------- + x, y : array_like + Arrays of sample data. + axis: int, optional + The axis along which the samples are tested. `x` and `y` can be of + different length along `axis`. + If `axis` is None, `x` and `y` are flattened and the test is done on + all values in the flattened arrays. + + Returns + ------- + z : scalar or ndarray + The z-score for the hypothesis test. For 1-D inputs a scalar is + returned; + p-value : scalar ndarray + The p-value for the hypothesis test. + + See Also + -------- + fligner : A non-parametric test for the equality of k variances + ansari : A non-parametric test for the equality of 2 variances + bartlett : A parametric test for equality of k variances in normal samples + levene : A parametric test for equality of k variances + + Notes + ----- + The data are assumed to be drawn from probability distributions ``f(x)`` + and ``f(x/s) / s`` respectively, for some probability density function f. + The null hypothesis is that ``s == 1``. + + For multi-dimensional arrays, if the inputs are of shapes + ``(n0, n1, n2, n3)`` and ``(n0, m1, n2, n3)``, then if ``axis=1``, the + resulting z and p values will have shape ``(n0, n2, n3)``. Note that + ``n1`` and ``m1`` don't have to be equal, but the other dimensions do. 
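+
+    Concretely, with ``R_i`` the ranks of the first sample within the pooled
+    sample of size ``N = n + m``, the statistic computed here is
+    ``M = sum((R_i - (N + 1)/2)**2)``.  Under the null hypothesis ``M`` has
+    mean ``n*(N*N - 1)/12`` and variance ``m*n*(N + 1)*(N + 2)*(N - 2)/180``;
+    the returned z-score is ``M`` standardized by these moments, and the
+    p-value is the corresponding two-sided normal tail probability.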
+ + Examples + -------- + >>> from scipy import stats + >>> x2 = np.random.randn(2, 45, 6, 7) + >>> x1 = np.random.randn(2, 30, 6, 7) + >>> z, p = stats.mood(x1, x2, axis=1) + >>> p.shape + (2, 6, 7) + + Find the number of points where the difference in scale is not significant: + + >>> (p > 0.1).sum() + 74 + + Perform the test with different scales: + + >>> x1 = np.random.randn(2, 30) + >>> x2 = np.random.randn(2, 35) * 10.0 + >>> stats.mood(x1, x2, axis=1) + (array([-5.84332354, -5.6840814 ]), array([5.11694980e-09, 1.31517628e-08])) + + """ + x = np.asarray(x, dtype=float) + y = np.asarray(y, dtype=float) + + if axis is None: + x = x.flatten() + y = y.flatten() + axis = 0 + + # Determine shape of the result arrays + res_shape = tuple([x.shape[ax] for ax in range(len(x.shape)) if ax != axis]) + if not (res_shape == tuple([y.shape[ax] for ax in range(len(y.shape)) if + ax != axis])): + raise ValueError("Dimensions of x and y on all axes except `axis` " + "should match") + + n = x.shape[axis] + m = y.shape[axis] + N = m + n + if N < 3: + raise ValueError("Not enough observations.") + + xy = np.concatenate((x, y), axis=axis) + if axis != 0: + xy = np.rollaxis(xy, axis) + + xy = xy.reshape(xy.shape[0], -1) + + # Generalized to the n-dimensional case by adding the axis argument, and + # using for loops, since rankdata is not vectorized. For improving + # performance consider vectorizing rankdata function. + all_ranks = np.zeros_like(xy) + for j in range(xy.shape[1]): + all_ranks[:, j] = stats.rankdata(xy[:, j]) + + Ri = all_ranks[:n] + M = sum((Ri - (N + 1.0) / 2) ** 2, axis=0) + # Approx stat. + mnM = n * (N * N - 1.0) / 12 + varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180 + z = (M - mnM) / sqrt(varM) + + # sf for right tail, cdf for left tail. Factor 2 for two-sidedness + z_pos = z > 0 + pval = np.zeros_like(z) + pval[z_pos] = 2 * distributions.norm.sf(z[z_pos]) + pval[~z_pos] = 2 * distributions.norm.cdf(z[~z_pos]) + + if res_shape == (): + # Return scalars, not 0-D arrays + z = z[0] + pval = pval[0] + else: + z.shape = res_shape + pval.shape = res_shape + + return z, pval + + +def wilcoxon(x, y=None, zero_method="wilcox", correction=False): + """ + Calculate the Wilcoxon signed-rank test. + + The Wilcoxon signed-rank test tests the null hypothesis that two + related paired samples come from the same distribution. In particular, + it tests whether the distribution of the differences x - y is symmetric + about zero. It is a non-parametric version of the paired T-test. + + Parameters + ---------- + x : array_like + The first set of measurements. + y : array_like, optional + The second set of measurements. If `y` is not given, then the `x` + array is considered to be the differences between the two sets of + measurements. + zero_method : string, {"pratt", "wilcox", "zsplit"}, optional + "pratt": + Pratt treatment: includes zero-differences in the ranking process + (more conservative) + "wilcox": + Wilcox treatment: discards all zero-differences + "zsplit": + Zero rank split: just like Pratt, but spliting the zero rank + between positive and negative ones + correction : bool, optional + If True, apply continuity correction by adjusting the Wilcoxon rank + statistic by 0.5 towards the mean value when computing the + z-statistic. Default is False. + + Returns + ------- + T : float + The sum of the ranks of the differences above or below zero, whichever + is smaller. + p-value : float + The two-sided p-value for the test. 
+ + Notes + ----- + Because the normal approximation is used for the calculations, the + samples used should be large. A typical rule is to require that + n > 20. + + References + ---------- + .. [1] http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test + + """ + + if not zero_method in ["wilcox", "pratt", "zsplit"]: + raise ValueError("Zero method should be either 'wilcox' \ + or 'pratt' or 'zsplit'") + + if y is None: + d = x + else: + x, y = map(asarray, (x, y)) + if len(x) != len(y): + raise ValueError('Unequal N in wilcoxon. Aborting.') + d = x-y + + if zero_method == "wilcox": + d = compress(not_equal(d, 0), d, axis=-1) # Keep all non-zero differences + + count = len(d) + if (count < 10): + warnings.warn("Warning: sample size too small for normal approximation.") + r = stats.rankdata(abs(d)) + r_plus = sum((d > 0) * r, axis=0) + r_minus = sum((d < 0) * r, axis=0) + + if zero_method == "zsplit": + r_zero = sum((d == 0) * r, axis=0) + r_plus += r_zero / 2. + r_minus += r_zero / 2. + + T = min(r_plus, r_minus) + mn = count*(count + 1.) * 0.25 + se = count*(count + 1.) * (2. * count + 1.) + + if zero_method == "pratt": + r = r[d != 0] + + replist, repnum = find_repeats(r) + if repnum.size != 0: + # Correction for repeated elements. + se -= 0.5 * (repnum * (repnum * repnum - 1)).sum() + + se = sqrt(se / 24) + correction = 0.5 * int(bool(correction)) * np.sign(T - mn) + z = (T - mn - correction) / se + prob = 2. * distributions.norm.sf(abs(z)) + return T, prob + + +def _hermnorm(N): + # return the negatively normalized hermite polynomials up to order N-1 + # (inclusive) + # using the recursive relationship + # p_n+1 = p_n(x)' - x*p_n(x) + # and p_0(x) = 1 + plist = [None]*N + plist[0] = poly1d(1) + for n in range(1,N): + plist[n] = plist[n-1].deriv() - poly1d([1,0])*plist[n-1] + return plist + + +def pdf_fromgamma(g1,g2,g3=0.0,g4=None): + if g4 is None: + g4 = 3*g2*g2 + sigsq = 1.0/g2 + sig = sqrt(sigsq) + mu = g1*sig**3.0 + p12 = _hermnorm(13) + for k in range(13): + p12[k] = p12[k]/sig**k + + # Add all of the terms to polynomial + totp = p12[0] - (g1/6.0*p12[3]) + \ + (g2/24.0*p12[4] + g1*g1/72.0*p12[6]) - \ + (g3/120.0*p12[5] + g1*g2/144.0*p12[7] + g1**3.0/1296.0*p12[9]) + \ + (g4/720*p12[6] + (g2*g2/1152.0+g1*g3/720)*p12[8] + + g1*g1*g2/1728.0*p12[10] + g1**4.0/31104.0*p12[12]) + # Final normalization + totp = totp / sqrt(2*pi)/sig + + def thefunc(x): + xn = (x-mu)/sig + return totp(xn)*exp(-xn*xn/2.0) + return thefunc + + +def _circfuncs_common(samples, high, low): + samples = np.asarray(samples) + if samples.size == 0: + return np.nan, np.nan + + ang = (samples - low)*2*pi / (high-low) + return samples, ang + + +def circmean(samples, high=2*pi, low=0, axis=None): + """ + Compute the circular mean for samples in a range. + + Parameters + ---------- + samples : array_like + Input array. + high : float or int, optional + High boundary for circular mean range. Default is ``2*pi``. + low : float or int, optional + Low boundary for circular mean range. Default is 0. + axis : int, optional + Axis along which means are computed. The default is to compute + the mean of the flattened array. + + Returns + ------- + circmean : float + Circular mean. 
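+
+    Examples
+    --------
+    A small sketch with illustrative values: angles that straddle the
+    wrap-around point average to a direction near zero, unlike their
+    arithmetic mean:
+
+    >>> from scipy import stats
+    >>> data = np.array([0.1, 2*np.pi - 0.1, 0.2])
+    >>> m = stats.circmean(data)   # roughly 0.07, while data.mean() is ~2.16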
+ + """ + samples, ang = _circfuncs_common(samples, high, low) + res = angle(np.mean(exp(1j*ang), axis=axis)) + mask = res < 0 + if (mask.ndim > 0): + res[mask] += 2*pi + elif mask: + res = res + 2*pi + + return res*(high-low)/2.0/pi + low + + +def circvar(samples, high=2*pi, low=0, axis=None): + """ + Compute the circular variance for samples assumed to be in a range + + Parameters + ---------- + samples : array_like + Input array. + low : float or int, optional + Low boundary for circular variance range. Default is 0. + high : float or int, optional + High boundary for circular variance range. Default is ``2*pi``. + axis : int, optional + Axis along which variances are computed. The default is to compute + the variance of the flattened array. + + Returns + ------- + circvar : float + Circular variance. + + Notes + ----- + This uses a definition of circular variance that in the limit of small + angles returns a number close to the 'linear' variance. + + """ + samples, ang = _circfuncs_common(samples, high, low) + res = np.mean(exp(1j*ang), axis=axis) + R = abs(res) + return ((high-low)/2.0/pi)**2 * 2 * log(1/R) + + +def circstd(samples, high=2*pi, low=0, axis=None): + """ + Compute the circular standard deviation for samples assumed to be in the + range [low to high]. + + Parameters + ---------- + samples : array_like + Input array. + low : float or int, optional + Low boundary for circular standard deviation range. Default is 0. + high : float or int, optional + High boundary for circular standard deviation range. + Default is ``2*pi``. + axis : int, optional + Axis along which standard deviations are computed. The default is + to compute the standard deviation of the flattened array. + + Returns + ------- + circstd : float + Circular standard deviation. + + Notes + ----- + This uses a definition of circular standard deviation that in the limit of + small angles returns a number close to the 'linear' standard deviation. + + """ + samples, ang = _circfuncs_common(samples, high, low) + res = np.mean(exp(1j*ang), axis=axis) + R = abs(res) + return ((high-low)/2.0/pi) * sqrt(-2*log(R)) + + +# Tests to include (from R) -- some of these already in stats. +######## +# X Ansari-Bradley +# X Bartlett (and Levene) +# X Binomial +# Y Pearson's Chi-squared (stats.chisquare) +# Y Association Between Paired samples (stats.pearsonr, stats.spearmanr) +# stats.kendalltau) -- these need work though +# Fisher's exact test +# X Fligner-Killeen Test +# Y Friedman Rank Sum (stats.friedmanchisquare?) +# Y Kruskal-Wallis +# Y Kolmogorov-Smirnov +# Cochran-Mantel-Haenszel Chi-Squared for Count +# McNemar's Chi-squared for Count +# X Mood Two-Sample +# X Test For Equal Means in One-Way Layout (see stats.ttest also) +# Pairwise Comparisons of proportions +# Pairwise t tests +# Tabulate p values for pairwise comparisons +# Pairwise Wilcoxon rank sum tests +# Power calculations two sample test of prop. 
+# Power calculations for one and two sample t tests +# Equal or Given Proportions +# Trend in Proportions +# Quade Test +# Y Student's T Test +# Y F Test to compare two variances +# XY Wilcoxon Rank Sum and Signed Rank Tests diff --git a/pywafo/src/wafo/stats/mstats.py b/pywafo/src/wafo/stats/mstats.py new file mode 100644 index 0000000..790c44d --- /dev/null +++ b/pywafo/src/wafo/stats/mstats.py @@ -0,0 +1,82 @@ +""" +=================================================================== +Statistical functions for masked arrays (:mod:`scipy.stats.mstats`) +=================================================================== + +.. currentmodule:: scipy.stats.mstats + +This module contains a large number of statistical functions that can +be used with masked arrays. + +Most of these functions are similar to those in scipy.stats but might +have small differences in the API or in the algorithm used. Since this +is a relatively new package, some API changes are still possible. + +.. autosummary:: + :toctree: generated/ + + argstoarray + betai + chisquare + count_tied_groups + describe + f_oneway + f_value_wilks_lambda + find_repeats + friedmanchisquare + gmean + hmean + kendalltau + kendalltau_seasonal + kruskalwallis + kruskalwallis + ks_twosamp + ks_twosamp + kurtosis + kurtosistest + linregress + mannwhitneyu + plotting_positions + mode + moment + mquantiles + msign + normaltest + obrientransform + pearsonr + plotting_positions + pointbiserialr + rankdata + scoreatpercentile + sem + signaltonoise + skew + skewtest + spearmanr + theilslopes + threshold + tmax + tmean + tmin + trim + trima + trimboth + trimmed_stde + trimr + trimtail + tsem + ttest_onesamp + ttest_ind + ttest_onesamp + ttest_rel + tvar + variation + winsorize + zmap + zscore + +""" +from __future__ import division, print_function, absolute_import + +from .mstats_basic import * +from .mstats_extras import * diff --git a/pywafo/src/wafo/stats/mstats_basic.py b/pywafo/src/wafo/stats/mstats_basic.py new file mode 100644 index 0000000..8ca98d9 --- /dev/null +++ b/pywafo/src/wafo/stats/mstats_basic.py @@ -0,0 +1,2068 @@ +""" +An extension of scipy.stats.stats to support masked arrays + +:author: Pierre GF Gerard-Marchant +:contact: pierregm_at_uga_edu +""" +# TODO : f_value_wilks_lambda looks botched... what are dfnum & dfden for ? +# TODO : ttest_reel looks botched: what are x1,x2,v1,v2 for ? 
+# TODO : reimplement ksonesamp + +from __future__ import division, print_function, absolute_import + +__author__ = "Pierre GF Gerard-Marchant" +__docformat__ = "restructuredtext en" + +__all__ = ['argstoarray', + 'betai', + 'chisquare','count_tied_groups', + 'describe', + 'f_oneway','f_value_wilks_lambda','find_repeats','friedmanchisquare', + 'gmean', + 'hmean', + 'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis', + 'ks_twosamp','ks_2samp','kurtosis','kurtosistest', + 'linregress', + 'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign', + 'normaltest', + 'obrientransform', + 'pearsonr','plotting_positions','pointbiserialr', + 'rankdata', + 'scoreatpercentile','sem', + 'sen_seasonal_slopes','signaltonoise','skew','skewtest','spearmanr', + 'theilslopes','threshold','tmax','tmean','tmin','trim','trimboth', + 'trimtail','trima','trimr','trimmed_mean','trimmed_std', + 'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp', + 'ttest_ind','ttest_rel','tvar', + 'variation', + 'winsorize', + 'zmap', 'zscore' + ] + +import numpy as np +from numpy import ndarray +import numpy.ma as ma +from numpy.ma import MaskedArray, masked, nomask + +from scipy.lib.six import iteritems + +import itertools +import warnings + +from . import stats +from . import distributions +import scipy.special as special +import scipy.misc as misc +from . import futil + + +genmissingvaldoc = """ +Notes +----- + Missing values are considered pair-wise: if a value is missing in x, + the corresponding value in y is masked. +""" +#------------------------------------------------------------------------------ + + +def _chk_asarray(a, axis): + # Always returns a masked array, raveled for axis=None + a = ma.asanyarray(a) + if axis is None: + a = ma.ravel(a) + outaxis = 0 + else: + outaxis = axis + return a, outaxis + + +def _chk2_asarray(a, b, axis): + a = ma.asanyarray(a) + b = ma.asanyarray(b) + if axis is None: + a = ma.ravel(a) + b = ma.ravel(b) + outaxis = 0 + else: + outaxis = axis + return a, b, outaxis + + +def _chk_size(a,b): + a = ma.asanyarray(a) + b = ma.asanyarray(b) + (na, nb) = (a.size, b.size) + if na != nb: + raise ValueError("The size of the input array should match!" + " (%s <> %s)" % (na,nb)) + return (a,b,na) + + +def argstoarray(*args): + """ + Constructs a 2D array from a group of sequences. + + Sequences are filled with missing values to match the length of the longest + sequence. + + Parameters + ---------- + args : sequences + Group of sequences. + + Returns + ------- + argstoarray : MaskedArray + A ( `m` x `n` ) masked array, where `m` is the number of arguments and + `n` the length of the longest argument. + + Notes + ----- + numpy.ma.row_stack has identical behavior, but is called with a sequence of + sequences. + + """ + if len(args) == 1 and not isinstance(args[0], ndarray): + output = ma.asarray(args[0]) + if output.ndim != 2: + raise ValueError("The input should be 2D") + else: + n = len(args) + m = max([len(k) for k in args]) + output = ma.array(np.empty((n,m), dtype=float), mask=True) + for (k,v) in enumerate(args): + output[k,:len(v)] = v + output[np.logical_not(np.isfinite(output._data))] = masked + return output + + +#####-------------------------------------------------------------------------- +#---- --- Ranking --- +#####-------------------------------------------------------------------------- + +def find_repeats(arr): + """Find repeats in arr and return a tuple (repeats, repeat_count). + Masked values are discarded. 
+ +Parameters +---------- + arr : sequence + Input array. The array is flattened if it is not 1D. + +Returns +------- + repeats : ndarray + Array of repeated values. + counts : ndarray + Array of counts. + + """ + marr = ma.compressed(arr) + if not marr.size: + return (np.array(0), np.array(0)) + (v1, v2, n) = futil.dfreps(ma.array(ma.compressed(arr), copy=True)) + return (v1[:n], v2[:n]) + + +def count_tied_groups(x, use_missing=False): + """ + Counts the number of tied values. + + Parameters + ---------- + x : sequence + Sequence of data on which to counts the ties + use_missing : boolean + Whether to consider missing values as tied. + + Returns + ------- + count_tied_groups : dict + Returns a dictionary (nb of ties: nb of groups). + + Examples + -------- + >>> z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6] + >>> count_tied_groups(z) + >>> {2:1, 3:2} + >>> # The ties were 0 (3x), 2 (3x) and 3 (2x) + >>> z = ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6]) + >>> count_tied_groups(z) + >>> {2:2, 3:1} + >>> # The ties were 0 (2x), 2 (3x) and 3 (2x) + >>> z[[1,-1]] = masked + >>> count_tied_groups(z, use_missing=True) + >>> {2:2, 3:1} + >>> # The ties were 2 (3x), 3 (2x) and masked (2x) + + """ + nmasked = ma.getmask(x).sum() + # We need the copy as find_repeats will overwrite the initial data + data = ma.compressed(x).copy() + (ties, counts) = find_repeats(data) + nties = {} + if len(ties): + nties = dict(zip(np.unique(counts), itertools.repeat(1))) + nties.update(dict(zip(*find_repeats(counts)))) + if nmasked and use_missing: + try: + nties[nmasked] += 1 + except KeyError: + nties[nmasked] = 1 + return nties + + +def rankdata(data, axis=None, use_missing=False): + """Returns the rank (also known as order statistics) of each data point + along the given axis. + + If some values are tied, their rank is averaged. + If some values are masked, their rank is set to 0 if use_missing is False, + or set to the average rank of the unmasked values if use_missing is True. + + Parameters + ---------- + data : sequence + Input data. The data is transformed to a masked array + axis : {None,int}, optional + Axis along which to perform the ranking. + If None, the array is first flattened. An exception is raised if + the axis is specified for arrays with a dimension larger than 2 + use_missing : {boolean}, optional + Whether the masked values have a rank of 0 (False) or equal to the + average rank of the unmasked values (True). + """ + # + def _rank1d(data, use_missing=False): + n = data.count() + rk = np.empty(data.size, dtype=float) + idx = data.argsort() + rk[idx[:n]] = np.arange(1,n+1) + # + if use_missing: + rk[idx[n:]] = (n+1)/2. 
+ else: + rk[idx[n:]] = 0 + # + repeats = find_repeats(data.copy()) + for r in repeats[0]: + condition = (data == r).filled(False) + rk[condition] = rk[condition].mean() + return rk + # + data = ma.array(data, copy=False) + if axis is None: + if data.ndim > 1: + return _rank1d(data.ravel(), use_missing).reshape(data.shape) + else: + return _rank1d(data, use_missing) + else: + return ma.apply_along_axis(_rank1d,axis,data,use_missing).view(ndarray) + + +#####-------------------------------------------------------------------------- +#---- --- Central tendency --- +#####-------------------------------------------------------------------------- + +def gmean(a, axis=0): + a, axis = _chk_asarray(a, axis) + log_a = ma.log(a) + return ma.exp(log_a.mean(axis=axis)) +gmean.__doc__ = stats.gmean.__doc__ + + +def hmean(a, axis=0): + a, axis = _chk_asarray(a, axis) + if isinstance(a, MaskedArray): + size = a.count(axis) + else: + size = a.shape[axis] + return size / (1.0/a).sum(axis) +hmean.__doc__ = stats.hmean.__doc__ + + +def mode(a, axis=0): + a, axis = _chk_asarray(a, axis) + + def _mode1D(a): + (rep,cnt) = find_repeats(a) + if not cnt.ndim: + return (0, 0) + elif cnt.size: + return (rep[cnt.argmax()], cnt.max()) + else: + not_masked_indices = ma.flatnotmasked_edges(a) + first_not_masked_index = not_masked_indices[0] + return (a[first_not_masked_index], 1) + + if axis is None: + output = _mode1D(ma.ravel(a)) + output = (ma.array(output[0]), ma.array(output[1])) + else: + output = ma.apply_along_axis(_mode1D, axis, a) + newshape = list(a.shape) + newshape[axis] = 1 + slices = [slice(None)] * output.ndim + slices[axis] = 0 + modes = output[tuple(slices)].reshape(newshape) + slices[axis] = 1 + counts = output[tuple(slices)].reshape(newshape) + output = (modes, counts) + return output +mode.__doc__ = stats.mode.__doc__ + + +#####-------------------------------------------------------------------------- +#---- --- Probabilities --- +#####-------------------------------------------------------------------------- + +def betai(a, b, x): + x = np.asanyarray(x) + x = ma.where(x < 1.0, x, 1.0) # if x > 1 then return 1.0 + return special.betainc(a, b, x) +betai.__doc__ = stats.betai.__doc__ + + +#####-------------------------------------------------------------------------- +#---- --- Correlation --- +#####-------------------------------------------------------------------------- + +def msign(x): + """Returns the sign of x, or 0 if x is masked.""" + return ma.filled(np.sign(x), 0) + + +def pearsonr(x,y): + """ + Calculates a Pearson correlation coefficient and the p-value for testing + non-correlation. + + The Pearson correlation coefficient measures the linear relationship + between two datasets. Strictly speaking, Pearson's correlation requires + that each dataset be normally distributed. Like other correlation + coefficients, this one varies between -1 and +1 with 0 implying no + correlation. Correlations of -1 or +1 imply an exact linear + relationship. Positive correlations imply that as `x` increases, so does + `y`. Negative correlations imply that as `x` increases, `y` decreases. + + The p-value roughly indicates the probability of an uncorrelated system + producing datasets that have a Pearson correlation at least as extreme + as the one computed from these datasets. The p-values are not entirely + reliable but are probably reasonable for datasets larger than 500 or so. 
+ + Parameters + ---------- + x : 1-D array_like + Input + y : 1-D array_like + Input + + Returns + ------- + pearsonr : float + Pearson's correlation coefficient, 2-tailed p-value. + + References + ---------- + http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation + + """ + (x, y, n) = _chk_size(x, y) + (x, y) = (x.ravel(), y.ravel()) + # Get the common mask and the total nb of unmasked elements + m = ma.mask_or(ma.getmask(x), ma.getmask(y)) + n -= m.sum() + df = n-2 + if df < 0: + return (masked, masked) + # + (mx, my) = (x.mean(), y.mean()) + (xm, ym) = (x-mx, y-my) + # + r_num = ma.add.reduce(xm*ym) + r_den = ma.sqrt(ma.dot(xm,xm) * ma.dot(ym,ym)) + r = r_num / r_den + # Presumably, if r > 1, then it is only some small artifact of floating + # point arithmetic. + r = min(r, 1.0) + r = max(r, -1.0) + df = n - 2 + + if r is masked or abs(r) == 1.0: + prob = 0. + else: + t_squared = (df / ((1.0 - r) * (1.0 + r))) * r * r + prob = betai(0.5*df, 0.5, df/(df + t_squared)) + return r, prob + + +def spearmanr(x, y, use_ties=True): + """ + Calculates a Spearman rank-order correlation coefficient and the p-value + to test for non-correlation. + + The Spearman correlation is a nonparametric measure of the linear + relationship between two datasets. Unlike the Pearson correlation, the + Spearman correlation does not assume that both datasets are normally + distributed. Like other correlation coefficients, this one varies + between -1 and +1 with 0 implying no correlation. Correlations of -1 or + +1 imply an exact linear relationship. Positive correlations imply that + as `x` increases, so does `y`. Negative correlations imply that as `x` + increases, `y` decreases. + + Missing values are discarded pair-wise: if a value is missing in `x`, the + corresponding value in `y` is masked. + + The p-value roughly indicates the probability of an uncorrelated system + producing datasets that have a Spearman correlation at least as extreme + as the one computed from these datasets. The p-values are not entirely + reliable but are probably reasonable for datasets larger than 500 or so. + + Parameters + ---------- + x : array_like + The length of `x` must be > 2. + y : array_like + The length of `y` must be > 2. + use_ties : bool, optional + Whether the correction for ties should be computed. + + Returns + ------- + spearmanr : float + Spearman correlation coefficient, 2-tailed p-value. + + References + ---------- + [CRCProbStat2000] section 14.7 + + """ + (x, y, n) = _chk_size(x, y) + (x, y) = (x.ravel(), y.ravel()) + # + m = ma.mask_or(ma.getmask(x), ma.getmask(y)) + n -= m.sum() + if m is not nomask: + x = ma.array(x, mask=m, copy=True) + y = ma.array(y, mask=m, copy=True) + df = n-2 + if df < 0: + raise ValueError("The input must have at least 3 entries!") + # Gets the ranks and rank differences + rankx = rankdata(x) + ranky = rankdata(y) + dsq = np.add.reduce((rankx-ranky)**2) + # Tie correction + if use_ties: + xties = count_tied_groups(x) + yties = count_tied_groups(y) + corr_x = np.sum(v*k*(k**2-1) for (k,v) in iteritems(xties))/12. + corr_y = np.sum(v*k*(k**2-1) for (k,v) in iteritems(yties))/12. + else: + corr_x = corr_y = 0 + denom = n*(n**2 - 1)/6. + if corr_x != 0 or corr_y != 0: + rho = denom - dsq - corr_x - corr_y + rho /= ma.sqrt((denom-2*corr_x)*(denom-2*corr_y)) + else: + rho = 1. - dsq/denom + # + t = ma.sqrt(ma.divide(df,(rho+1.0)*(1.0-rho))) * rho + if t is masked: + prob = 0. 
+ else: + prob = betai(0.5*df,0.5,df/(df+t*t)) + return rho, prob + + +def kendalltau(x, y, use_ties=True, use_missing=False): + """ + Computes Kendall's rank correlation tau on two variables *x* and *y*. + + Parameters + ---------- + xdata : sequence + First data list (for example, time). + ydata : sequence + Second data list. + use_ties : {True, False}, optional + Whether ties correction should be performed. + use_missing : {False, True}, optional + Whether missing data should be allocated a rank of 0 (False) or the + average rank (True) + + Returns + ------- + tau : float + Kendall tau + prob : float + Approximate 2-side p-value. + + """ + (x, y, n) = _chk_size(x, y) + (x, y) = (x.flatten(), y.flatten()) + m = ma.mask_or(ma.getmask(x), ma.getmask(y)) + if m is not nomask: + x = ma.array(x, mask=m, copy=True) + y = ma.array(y, mask=m, copy=True) + n -= m.sum() + # + if n < 2: + return (np.nan, np.nan) + # + rx = ma.masked_equal(rankdata(x, use_missing=use_missing), 0) + ry = ma.masked_equal(rankdata(y, use_missing=use_missing), 0) + idx = rx.argsort() + (rx, ry) = (rx[idx], ry[idx]) + C = np.sum([((ry[i+1:] > ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum() + for i in range(len(ry)-1)], dtype=float) + D = np.sum([((ry[i+1:] < ry[i])*(rx[i+1:] > rx[i])).filled(0).sum() + for i in range(len(ry)-1)], dtype=float) + if use_ties: + xties = count_tied_groups(x) + yties = count_tied_groups(y) + corr_x = np.sum([v*k*(k-1) for (k,v) in iteritems(xties)], dtype=float) + corr_y = np.sum([v*k*(k-1) for (k,v) in iteritems(yties)], dtype=float) + denom = ma.sqrt((n*(n-1)-corr_x)/2. * (n*(n-1)-corr_y)/2.) + else: + denom = n*(n-1)/2. + tau = (C-D) / denom + # + var_s = n*(n-1)*(2*n+5) + if use_ties: + var_s -= np.sum(v*k*(k-1)*(2*k+5)*1. for (k,v) in iteritems(xties)) + var_s -= np.sum(v*k*(k-1)*(2*k+5)*1. for (k,v) in iteritems(yties)) + v1 = np.sum([v*k*(k-1) for (k, v) in iteritems(xties)], dtype=float) *\ + np.sum([v*k*(k-1) for (k, v) in iteritems(yties)], dtype=float) + v1 /= 2.*n*(n-1) + if n > 2: + v2 = np.sum([v*k*(k-1)*(k-2) for (k,v) in iteritems(xties)], + dtype=float) * \ + np.sum([v*k*(k-1)*(k-2) for (k,v) in iteritems(yties)], + dtype=float) + v2 /= 9.*n*(n-1)*(n-2) + else: + v2 = 0 + else: + v1 = v2 = 0 + var_s /= 18. + var_s += (v1 + v2) + z = (C-D)/np.sqrt(var_s) + prob = special.erfc(abs(z)/np.sqrt(2)) + return (tau, prob) + + +def kendalltau_seasonal(x): + """ + Computes a multivariate Kendall's rank correlation tau, for seasonal data. + + Parameters + ---------- + x : 2-D ndarray + Array of seasonal data, with seasons in columns. + + """ + x = ma.array(x, subok=True, copy=False, ndmin=2) + (n,m) = x.shape + n_p = x.count(0) + # + S_szn = np.sum(msign(x[i:]-x[i]).sum(0) for i in range(n)) + S_tot = S_szn.sum() + # + n_tot = x.count() + ties = count_tied_groups(x.compressed()) + corr_ties = np.sum(v*k*(k-1) for (k,v) in iteritems(ties)) + denom_tot = ma.sqrt(1.*n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2. + # + R = rankdata(x, axis=0, use_missing=True) + K = ma.empty((m,m), dtype=int) + covmat = ma.empty((m,m), dtype=float) +# cov_jj = ma.empty(m, dtype=float) + denom_szn = ma.empty(m, dtype=float) + for j in range(m): + ties_j = count_tied_groups(x[:,j].compressed()) + corr_j = np.sum(v*k*(k-1) for (k,v) in iteritems(ties_j)) + cmb = n_p[j]*(n_p[j]-1) + for k in range(j,m,1): + K[j,k] = np.sum(msign((x[i:,j]-x[i,j])*(x[i:,k]-x[i,k])).sum() + for i in range(n)) + covmat[j,k] = (K[j,k] + 4*(R[:,j]*R[:,k]).sum() - + n*(n_p[j]+1)*(n_p[k]+1))/3. 
+ K[k,j] = K[j,k] + covmat[k,j] = covmat[j,k] +# cov_jj[j] = (nn_p*(2*n_p[j]+5)) +# cov_jj[j] -= np.sum(v*k*(k-1)*(2*k+5) for (k,v) in ties_j.iteritems()) +# cov_jj[j] /= 18. + denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2. + var_szn = covmat.diagonal() + # + z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn) + z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum()) + z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum()) + # + prob_szn = special.erfc(abs(z_szn)/np.sqrt(2)) + prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2)) + prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2)) + # + chi2_tot = (z_szn*z_szn).sum() + chi2_trd = m * z_szn.mean()**2 + output = {'seasonal tau': S_szn/denom_szn, + 'global tau': S_tot/denom_tot, + 'global tau (alt)': S_tot/denom_szn.sum(), + 'seasonal p-value': prob_szn, + 'global p-value (indep)': prob_tot_ind, + 'global p-value (dep)': prob_tot_dep, + 'chi2 total': chi2_tot, + 'chi2 trend': chi2_trd, + } + return output + + +def pointbiserialr(x, y): + x = ma.fix_invalid(x, copy=True).astype(bool) + y = ma.fix_invalid(y, copy=True).astype(float) + # Get rid of the missing data .......... + m = ma.mask_or(ma.getmask(x), ma.getmask(y)) + if m is not nomask: + unmask = np.logical_not(m) + x = x[unmask] + y = y[unmask] + # + n = len(x) + # phat is the fraction of x values that are True + phat = x.sum() / float(n) + y0 = y[~x] # y-values where x is False + y1 = y[x] # y-values where x is True + y0m = y0.mean() + y1m = y1.mean() + # + rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std() + # + df = n-2 + t = rpb*ma.sqrt(df/(1.0-rpb**2)) + prob = betai(0.5*df, 0.5, df/(df+t*t)) + return rpb, prob + +if stats.pointbiserialr.__doc__: + pointbiserialr.__doc__ = stats.pointbiserialr.__doc__ + genmissingvaldoc + + +def linregress(*args): + if len(args) == 1: # more than 1D array? + args = ma.array(args[0], copy=True) + if len(args) == 2: + x = args[0] + y = args[1] + else: + x = args[:,0] + y = args[:,1] + else: + x = ma.array(args[0]).flatten() + y = ma.array(args[1]).flatten() + m = ma.mask_or(ma.getmask(x), ma.getmask(y)) + if m is not nomask: + x = ma.array(x,mask=m) + y = ma.array(y,mask=m) + n = len(x) + (xmean, ymean) = (x.mean(), y.mean()) + (xm, ym) = (x-xmean, y-ymean) + (Sxx, Syy) = (ma.add.reduce(xm*xm), ma.add.reduce(ym*ym)) + Sxy = ma.add.reduce(xm*ym) + r_den = ma.sqrt(Sxx*Syy) + if r_den == 0.0: + r = 0.0 + else: + r = Sxy / r_den + if (r > 1.0): + r = 1.0 # from numerical error + # z = 0.5*log((1.0+r+TINY)/(1.0-r+TINY)) + df = n-2 + t = r * ma.sqrt(df/(1.0-r*r)) + prob = betai(0.5*df,0.5,df/(df+t*t)) + slope = Sxy / Sxx + intercept = ymean - slope*xmean + sterrest = ma.sqrt(1.-r*r) * y.std() + return slope, intercept, r, prob, sterrest + +if stats.linregress.__doc__: + linregress.__doc__ = stats.linregress.__doc__ + genmissingvaldoc + + +def theilslopes(y, x=None, alpha=0.05): + """ + Computes the Theil slope as the median of all slopes between paired values. + + Parameters + ---------- + y : array_like + Dependent variable. + x : {None, array_like}, optional + Independent variable. If None, use arange(len(y)) instead. + alpha : float + Confidence degree. 
+ + Returns + ------- + medslope : float + Theil slope + medintercept : float + Intercept of the Theil line, as median(y)-medslope*median(x) + lo_slope : float + Lower bound of the confidence interval on medslope + up_slope : float + Upper bound of the confidence interval on medslope + + """ + y = ma.asarray(y).flatten() + y[-1] = masked + n = len(y) + if x is None: + x = ma.arange(len(y), dtype=float) + else: + x = ma.asarray(x).flatten() + if len(x) != n: + raise ValueError("Incompatible lengths ! (%s<>%s)" % (n,len(x))) + m = ma.mask_or(ma.getmask(x), ma.getmask(y)) + y._mask = x._mask = m + ny = y.count() + # + slopes = ma.hstack([(y[i+1:]-y[i])/(x[i+1:]-x[i]) for i in range(n-1)]) + slopes.sort() + medslope = ma.median(slopes) + medinter = ma.median(y) - medslope*ma.median(x) + # + if alpha > 0.5: + alpha = 1.-alpha + z = stats.distributions.norm.ppf(alpha/2.) + # + (xties, yties) = (count_tied_groups(x), count_tied_groups(y)) + nt = ny*(ny-1)/2. + sigsq = (ny*(ny-1)*(2*ny+5)/18.) + sigsq -= np.sum(v*k*(k-1)*(2*k+5) for (k,v) in iteritems(xties)) + sigsq -= np.sum(v*k*(k-1)*(2*k+5) for (k,v) in iteritems(yties)) + sigma = np.sqrt(sigsq) + + Ru = min(np.round((nt - z*sigma)/2. + 1), len(slopes)-1) + Rl = max(np.round((nt + z*sigma)/2.), 0) + delta = slopes[[Rl,Ru]] + return medslope, medinter, delta[0], delta[1] + + +def sen_seasonal_slopes(x): + x = ma.array(x, subok=True, copy=False, ndmin=2) + (n,_) = x.shape + # Get list of slopes per season + szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1,n-i)[:,None] + for i in range(n)]) + szn_medslopes = ma.median(szn_slopes, axis=0) + medslope = ma.median(szn_slopes, axis=None) + return szn_medslopes, medslope + + +#####-------------------------------------------------------------------------- +#---- --- Inferential statistics --- +#####-------------------------------------------------------------------------- + +def ttest_1samp(a, popmean, axis=0): + a, axis = _chk_asarray(a, axis) + if a.size == 0: + return (np.nan, np.nan) + + x = a.mean(axis=axis) + v = a.var(axis=axis, ddof=1) + n = a.count(axis=axis) + df = n - 1. + svar = ((n - 1) * v) / df + t = (x - popmean) / ma.sqrt(svar / n) + prob = betai(0.5 * df, 0.5, df / (df + t*t)) + return t, prob +ttest_1samp.__doc__ = stats.ttest_1samp.__doc__ +ttest_onesamp = ttest_1samp + + +def ttest_ind(a, b, axis=0): + a, b, axis = _chk2_asarray(a, b, axis) + if a.size == 0 or b.size == 0: + return (np.nan, np.nan) + + (x1, x2) = (a.mean(axis), b.mean(axis)) + (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1)) + (n1, n2) = (a.count(axis), b.count(axis)) + df = n1 + n2 - 2. + svar = ((n1-1)*v1+(n2-1)*v2) / df + t = (x1-x2)/ma.sqrt(svar*(1.0/n1 + 1.0/n2)) # N-D COMPUTATION HERE!!!!!! 
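+    # Descriptive note (added comment): under the null hypothesis this pooled-
+    # variance statistic follows a Student t distribution with df = n1 + n2 - 2;
+    # the two-sided p-value is obtained below through the regularized incomplete
+    # beta function (betai), using P(|T| > t) = I_x(df/2, 1/2) with x = df/(df+t**2).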
+ t = ma.filled(t, 1) # replace NaN t-values with 1.0 + probs = betai(0.5 * df, 0.5, df/(df + t*t)).reshape(t.shape) + return t, probs.squeeze() +ttest_ind.__doc__ = stats.ttest_ind.__doc__ + + +def ttest_rel(a, b, axis=0): + a, b, axis = _chk2_asarray(a, b, axis) + if len(a) != len(b): + raise ValueError('unequal length arrays') + + if a.size == 0 or b.size == 0: + return (np.nan, np.nan) + + (x1, x2) = (a.mean(axis), b.mean(axis)) + (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1)) + n = a.count(axis) + df = (n-1.0) + d = (a-b).astype('d') + denom = ma.sqrt((n*ma.add.reduce(d*d,axis) - ma.add.reduce(d,axis)**2) / df) + # zerodivproblem = denom == 0 + t = ma.add.reduce(d, axis) / denom + t = ma.filled(t, 1) + probs = betai(0.5*df,0.5,df/(df+t*t)).reshape(t.shape).squeeze() + return t, probs +ttest_rel.__doc__ = stats.ttest_rel.__doc__ + + +# stats.chisquare works with masked arrays, so we don't need to +# implement it here. +# For backwards compatibilty, stats.chisquare is included in +# the stats.mstats namespace. +chisquare = stats.chisquare + + +def mannwhitneyu(x,y, use_continuity=True): + """ + Computes the Mann-Whitney statistic + + Missing values in `x` and/or `y` are discarded. + + Parameters + ---------- + x : sequence + Input + y : sequence + Input + use_continuity : {True, False}, optional + Whether a continuity correction (1/2.) should be taken into account. + + Returns + ------- + u : float + The Mann-Whitney statistics + prob : float + Approximate p-value assuming a normal distribution. + + """ + x = ma.asarray(x).compressed().view(ndarray) + y = ma.asarray(y).compressed().view(ndarray) + ranks = rankdata(np.concatenate([x,y])) + (nx, ny) = (len(x), len(y)) + nt = nx + ny + U = ranks[:nx].sum() - nx*(nx+1)/2. + U = max(U, nx*ny - U) + u = nx*ny - U + # + mu = (nx*ny)/2. + sigsq = (nt**3 - nt)/12. + ties = count_tied_groups(ranks) + sigsq -= np.sum(v*(k**3-k) for (k,v) in iteritems(ties))/12. + sigsq *= nx*ny/float(nt*(nt-1)) + # + if use_continuity: + z = (U - 1/2. - mu) / ma.sqrt(sigsq) + else: + z = (U - mu) / ma.sqrt(sigsq) + prob = special.erfc(abs(z)/np.sqrt(2)) + return (u, prob) + + +def kruskalwallis(*args): + output = argstoarray(*args) + ranks = ma.masked_equal(rankdata(output, use_missing=False), 0) + sumrk = ranks.sum(-1) + ngrp = ranks.count(-1) + ntot = ranks.count() +# ssbg = (sumrk**2/ranks.count(-1)).sum() - ranks.sum()**2/ntotal +# H = ssbg / (ntotal*(ntotal+1)/12.) + H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1) + # Tie correction + ties = count_tied_groups(ranks) + T = 1. - np.sum(v*(k**3-k) for (k,v) in iteritems(ties))/float(ntot**3-ntot) + if T == 0: + raise ValueError('All numbers are identical in kruskal') + H /= T + # + df = len(output) - 1 + prob = stats.chisqprob(H,df) + return (H, prob) +kruskal = kruskalwallis +kruskalwallis.__doc__ = stats.kruskal.__doc__ + + +_kolmog2 = special.kolmogorov + + +def _kolmog1(x,n): + if x <= 0: + return 0 + if x >= 1: + return 1 + j = np.arange(np.floor(n*(1-x))+1) + return 1 - x * np.sum(np.exp(np.log(misc.comb(n,j)) + + (n-j) * np.log(1-x-j/float(n)) + + (j-1) * np.log(x+j/float(n)))) + + +def ks_twosamp(data1, data2, alternative="two-sided"): + """ + Computes the Kolmogorov-Smirnov test on two samples. + + Missing values are discarded. + + Parameters + ---------- + data1 : array_like + First data set + data2 : array_like + Second data set + alternative : {'two-sided', 'less', 'greater'}, optional + Indicates the alternative hypothesis. Default is 'two-sided'. 
+ + Returns + ------- + d : float + Value of the Kolmogorov Smirnov test + p : float + Corresponding p-value. + + """ + (data1, data2) = (ma.asarray(data1), ma.asarray(data2)) + (n1, n2) = (data1.count(), data2.count()) + n = (n1*n2/float(n1+n2)) + mix = ma.concatenate((data1.compressed(), data2.compressed())) + mixsort = mix.argsort(kind='mergesort') + csum = np.where(mixsort < n1, 1./n1, -1./n2).cumsum() + # Check for ties + if len(np.unique(mix)) < (n1+n2): + csum = csum[np.r_[np.diff(mix[mixsort]).nonzero()[0],-1]] + # + alternative = str(alternative).lower()[0] + if alternative == 't': + d = ma.abs(csum).max() + prob = _kolmog2(np.sqrt(n)*d) + elif alternative == 'l': + d = -csum.min() + prob = np.exp(-2*n*d**2) + elif alternative == 'g': + d = csum.max() + prob = np.exp(-2*n*d**2) + else: + raise ValueError("Invalid value for the alternative hypothesis: " + "should be in 'two-sided', 'less' or 'greater'") + return (d, prob) +ks_2samp = ks_twosamp + + +def ks_twosamp_old(data1, data2): + """ Computes the Kolmogorov-Smirnov statistic on 2 samples. + + Returns + ------- + KS D-value, p-value + + """ + (data1, data2) = [ma.asarray(d).compressed() for d in (data1,data2)] + return stats.ks_2samp(data1,data2) + + +#####-------------------------------------------------------------------------- +#---- --- Trimming --- +#####-------------------------------------------------------------------------- + +def threshold(a, threshmin=None, threshmax=None, newval=0): + """ + Clip array to a given value. + + Similar to numpy.clip(), except that values less than `threshmin` or + greater than `threshmax` are replaced by `newval`, instead of by + `threshmin` and `threshmax` respectively. + + Parameters + ---------- + a : ndarray + Input data + threshmin : {None, float}, optional + Lower threshold. If None, set to the minimum value. + threshmax : {None, float}, optional + Upper threshold. If None, set to the maximum value. + newval : {0, float}, optional + Value outside the thresholds. + + Returns + ------- + threshold : ndarray + Returns `a`, with values less then `threshmin` and values greater + `threshmax` replaced with `newval`. + + """ + a = ma.array(a, copy=True) + mask = np.zeros(a.shape, dtype=bool) + if threshmin is not None: + mask |= (a < threshmin).filled(False) + if threshmax is not None: + mask |= (a > threshmax).filled(False) + a[mask] = newval + return a + + +def trima(a, limits=None, inclusive=(True,True)): + """Trims an array by masking the data outside some given limits. + Returns a masked version of the input array. + + Parameters + ---------- + a : sequence + Input array. + limits : {None, tuple}, optional + Tuple of (lower limit, upper limit) in absolute values. + Values of the input array lower (greater) than the lower (upper) limit + will be masked. A limit is None indicates an open interval. + inclusive : {(True,True) tuple}, optional + Tuple of (lower flag, upper flag), indicating whether values exactly + equal to the lower (upper) limit are allowed. 
+ + """ + a = ma.asarray(a) + a.unshare_mask() + if limits is None: + return a + (lower_lim, upper_lim) = limits + (lower_in, upper_in) = inclusive + condition = False + if lower_lim is not None: + if lower_in: + condition |= (a < lower_lim) + else: + condition |= (a <= lower_lim) + if upper_lim is not None: + if upper_in: + condition |= (a > upper_lim) + else: + condition |= (a >= upper_lim) + a[condition.filled(True)] = masked + return a + + +def trimr(a, limits=None, inclusive=(True, True), axis=None): + """ + Trims an array by masking some proportion of the data on each end. + Returns a masked version of the input array. + + Parameters + ---------- + a : sequence + Input array. + limits : {None, tuple}, optional + Tuple of the percentages to cut on each side of the array, with respect + to the number of unmasked data, as floats between 0. and 1. + Noting n the number of unmasked data before trimming, the + (n*limits[0])th smallest data and the (n*limits[1])th largest data are + masked, and the total number of unmasked data after trimming is + n*(1.-sum(limits)). The value of one limit can be set to None to + indicate an open interval. + inclusive : {(True,True) tuple}, optional + Tuple of flags indicating whether the number of data being masked on + the left (right) end should be truncated (True) or rounded (False) to + integers. + axis : {None,int}, optional + Axis along which to trim. If None, the whole array is trimmed, but its + shape is maintained. + + """ + def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive): + n = a.count() + idx = a.argsort() + if low_limit: + if low_inclusive: + lowidx = int(low_limit*n) + else: + lowidx = np.round(low_limit*n) + a[idx[:lowidx]] = masked + if up_limit is not None: + if up_inclusive: + upidx = n - int(n*up_limit) + else: + upidx = n - np.round(n*up_limit) + a[idx[upidx:]] = masked + return a + # + a = ma.asarray(a) + a.unshare_mask() + if limits is None: + return a + # Check the limits + (lolim, uplim) = limits + errmsg = "The proportion to cut from the %s should be between 0. and 1." + if lolim is not None: + if lolim > 1. or lolim < 0: + raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim) + if uplim is not None: + if uplim > 1. or uplim < 0: + raise ValueError(errmsg % 'end' + "(got %s)" % uplim) + # + (loinc, upinc) = inclusive + # + if axis is None: + shp = a.shape + return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp) + else: + return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc) + +trimdoc = """ + Parameters + ---------- + a : sequence + Input array + limits : {None, tuple}, optional + If `relative` is False, tuple (lower limit, upper limit) in absolute values. + Values of the input array lower (greater) than the lower (upper) limit are + masked. + + If `relative` is True, tuple (lower percentage, upper percentage) to cut + on each side of the array, with respect to the number of unmasked data. + + Noting n the number of unmasked data before trimming, the (n*limits[0])th + smallest data and the (n*limits[1])th largest data are masked, and the + total number of unmasked data after trimming is n*(1.-sum(limits)) + In each case, the value of one limit can be set to None to indicate an + open interval. + + If limits is None, no trimming is performed + inclusive : {(bool, bool) tuple}, optional + If `relative` is False, tuple indicating whether values exactly equal + to the absolute limits are allowed. 
+ If `relative` is True, tuple indicating whether the number of data + being masked on each side should be rounded (True) or truncated + (False). + relative : bool, optional + Whether to consider the limits as absolute values (False) or proportions + to cut (True). + axis : int, optional + Axis along which to trim. +""" + + +def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None): + """ + Trims an array by masking the data outside some given limits. + + Returns a masked version of the input array. + + %s + + Examples + -------- + >>> z = [ 1, 2, 3, 4, 5, 6, 7, 8, 9,10] + >>> trim(z,(3,8)) + [--,--, 3, 4, 5, 6, 7, 8,--,--] + >>> trim(z,(0.1,0.2),relative=True) + [--, 2, 3, 4, 5, 6, 7, 8,--,--] + + """ + if relative: + return trimr(a, limits=limits, inclusive=inclusive, axis=axis) + else: + return trima(a, limits=limits, inclusive=inclusive) + +if trim.__doc__ is not None: + trim.__doc__ = trim.__doc__ % trimdoc + + +def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None): + """ + Trims the smallest and largest data values. + + Trims the `data` by masking the ``int(proportiontocut * n)`` smallest and + ``int(proportiontocut * n)`` largest values of data along the given axis, + where n is the number of unmasked values before trimming. + + Parameters + ---------- + data : ndarray + Data to trim. + proportiontocut : float, optional + Percentage of trimming (as a float between 0 and 1). + If n is the number of unmasked values before trimming, the number of + values after trimming is ``(1 - 2*proportiontocut) * n``. + Default is 0.2. + inclusive : {(bool, bool) tuple}, optional + Tuple indicating whether the number of data being masked on each side + should be rounded (True) or truncated (False). + axis : int, optional + Axis along which to perform the trimming. + If None, the input array is first flattened. + + """ + return trimr(data, limits=(proportiontocut,proportiontocut), + inclusive=inclusive, axis=axis) + +#.............................................................................. + + +def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True), + axis=None): + """ + Trims the data by masking values from one tail. + + Parameters + ---------- + data : array_like + Data to trim. + proportiontocut : float, optional + Percentage of trimming. If n is the number of unmasked values + before trimming, the number of values after trimming is + ``(1 - proportiontocut) * n``. Default is 0.2. + tail : {'left','right'}, optional + If 'left' the `proportiontocut` lowest values will be masked. + If 'right' the `proportiontocut` highest values will be masked. + Default is 'left'. + inclusive : {(bool, bool) tuple}, optional + Tuple indicating whether the number of data being masked on each side + should be rounded (True) or truncated (False). Default is + (True, True). + axis : int, optional + Axis along which to perform the trimming. + If None, the input array is first flattened. Default is None. + + Returns + ------- + trimtail : ndarray + Returned array of same shape as `data` with masked tail values. 
+ + """ + tail = str(tail).lower()[0] + if tail == 'l': + limits = (proportiontocut,None) + elif tail == 'r': + limits = (None, proportiontocut) + else: + raise TypeError("The tail argument should be in ('left','right')") + return trimr(data, limits=limits, axis=axis, inclusive=inclusive) + +trim1 = trimtail + + +def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, + axis=None): + """Returns the trimmed mean of the data along the given axis. + + %s + + """ % trimdoc + if (not isinstance(limits,tuple)) and isinstance(limits,float): + limits = (limits, limits) + if relative: + return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis) + else: + return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis) + + +def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, + axis=None, ddof=0): + """Returns the trimmed variance of the data along the given axis. + + %s + ddof : {0,integer}, optional + Means Delta Degrees of Freedom. The denominator used during computations + is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un- + biased estimate of the variance. + + """ % trimdoc + if (not isinstance(limits,tuple)) and isinstance(limits,float): + limits = (limits, limits) + if relative: + out = trimr(a,limits=limits, inclusive=inclusive,axis=axis) + else: + out = trima(a,limits=limits,inclusive=inclusive) + return out.var(axis=axis, ddof=ddof) + + +def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, + axis=None, ddof=0): + """Returns the trimmed standard deviation of the data along the given axis. + + %s + ddof : {0,integer}, optional + Means Delta Degrees of Freedom. The denominator used during computations + is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un- + biased estimate of the variance. + + """ % trimdoc + if (not isinstance(limits,tuple)) and isinstance(limits,float): + limits = (limits, limits) + if relative: + out = trimr(a,limits=limits,inclusive=inclusive,axis=axis) + else: + out = trima(a,limits=limits,inclusive=inclusive) + return out.std(axis=axis,ddof=ddof) + + +def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None): + """ + Returns the standard error of the trimmed mean along the given axis. + + Parameters + ---------- + a : sequence + Input array + limits : {(0.1,0.1), tuple of float}, optional + tuple (lower percentage, upper percentage) to cut on each side of the + array, with respect to the number of unmasked data. + + If n is the number of unmasked data before trimming, the values + smaller than ``n * limits[0]`` and the values larger than + ``n * `limits[1]`` are masked, and the total number of unmasked + data after trimming is ``n * (1.-sum(limits))``. In each case, + the value of one limit can be set to None to indicate an open interval. + If `limits` is None, no trimming is performed. + inclusive : {(bool, bool) tuple} optional + Tuple indicating whether the number of data being masked on each side + should be rounded (True) or truncated (False). + axis : int, optional + Axis along which to trim. + + Returns + ------- + trimmed_stde : scalar or ndarray + + """ + #........................ + def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive): + "Returns the standard error of the trimmed mean for a 1D input data." 
+ n = a.count() + idx = a.argsort() + if low_limit: + if low_inclusive: + lowidx = int(low_limit*n) + else: + lowidx = np.round(low_limit*n) + a[idx[:lowidx]] = masked + if up_limit is not None: + if up_inclusive: + upidx = n - int(n*up_limit) + else: + upidx = n - np.round(n*up_limit) + a[idx[upidx:]] = masked + a[idx[:lowidx]] = a[idx[lowidx]] + a[idx[upidx:]] = a[idx[upidx-1]] + winstd = a.std(ddof=1) + return winstd / ((1-low_limit-up_limit)*np.sqrt(len(a))) + #........................ + a = ma.array(a, copy=True, subok=True) + a.unshare_mask() + if limits is None: + return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis)) + if (not isinstance(limits,tuple)) and isinstance(limits,float): + limits = (limits, limits) + # Check the limits + (lolim, uplim) = limits + errmsg = "The proportion to cut from the %s should be between 0. and 1." + if lolim is not None: + if lolim > 1. or lolim < 0: + raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim) + if uplim is not None: + if uplim > 1. or uplim < 0: + raise ValueError(errmsg % 'end' + "(got %s)" % uplim) + # + (loinc, upinc) = inclusive + if (axis is None): + return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc) + else: + if a.ndim > 2: + raise ValueError("Array 'a' must be at most two dimensional, but got a.ndim = %d" % a.ndim) + return ma.apply_along_axis(_trimmed_stde_1D, axis, a, + lolim,uplim,loinc,upinc) + + +def tmean(a, limits=None, inclusive=(True,True)): + return trima(a, limits=limits, inclusive=inclusive).mean() +tmean.__doc__ = stats.tmean.__doc__ + + +def tvar(a, limits=None, inclusive=(True,True)): + return trima(a, limits=limits, inclusive=inclusive).var() +tvar.__doc__ = stats.tvar.__doc__ + + +def tmin(a, lowerlimit=None, axis=0, inclusive=True): + a, axis = _chk_asarray(a, axis) + am = trima(a, (lowerlimit, None), (inclusive, False)) + return ma.minimum.reduce(am, axis) +tmin.__doc__ = stats.tmin.__doc__ + + +def tmax(a, upperlimit, axis=0, inclusive=True): + a, axis = _chk_asarray(a, axis) + am = trima(a, (None, upperlimit), (False, inclusive)) + return ma.maximum.reduce(am, axis) +tmax.__doc__ = stats.tmax.__doc__ + + +def tsem(a, limits=None, inclusive=(True,True)): + a = ma.asarray(a).ravel() + if limits is None: + n = float(a.count()) + return a.std()/ma.sqrt(n) + am = trima(a.ravel(), limits, inclusive) + sd = np.sqrt(am.var()) + return sd / am.count() +tsem.__doc__ = stats.tsem.__doc__ + + +def winsorize(a, limits=None, inclusive=(True,True), inplace=False, axis=None): + """ + Returns a Winsorized version of the input array. + + The (limits[0])th lowest values are set to the (limits[0])th percentile, + and the (limits[1])th highest values are set to the (limits[1])th + percentile. + Masked values are skipped. + + + Parameters + ---------- + a : sequence + Input array. + limits : {None, tuple of float}, optional + Tuple of the percentages to cut on each side of the array, with respect + to the number of unmasked data, as floats between 0. and 1. + Noting n the number of unmasked data before trimming, the + (n*limits[0])th smallest data and the (n*limits[1])th largest data are + masked, and the total number of unmasked data after trimming + is n*(1.-sum(limits)) The value of one limit can be set to None to + indicate an open interval. + inclusive : {(True, True) tuple}, optional + Tuple indicating whether the number of data being masked on each side + should be rounded (True) or truncated (False). 
+ inplace : {False, True}, optional + Whether to winsorize in place (True) or to use a copy (False) + axis : {None, int}, optional + Axis along which to trim. If None, the whole array is trimmed, but its + shape is maintained. + + """ + def _winsorize1D(a, low_limit, up_limit, low_include, up_include): + n = a.count() + idx = a.argsort() + if low_limit: + if low_include: + lowidx = int(low_limit*n) + else: + lowidx = np.round(low_limit*n) + a[idx[:lowidx]] = a[idx[lowidx]] + if up_limit is not None: + if up_include: + upidx = n - int(n*up_limit) + else: + upidx = n - np.round(n*up_limit) + a[idx[upidx:]] = a[idx[upidx-1]] + return a + # We gonna modify a: better make a copy + a = ma.array(a, copy=np.logical_not(inplace)) + # + if limits is None: + return a + if (not isinstance(limits,tuple)) and isinstance(limits,float): + limits = (limits, limits) + # Check the limits + (lolim, uplim) = limits + errmsg = "The proportion to cut from the %s should be between 0. and 1." + if lolim is not None: + if lolim > 1. or lolim < 0: + raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim) + if uplim is not None: + if uplim > 1. or uplim < 0: + raise ValueError(errmsg % 'end' + "(got %s)" % uplim) + # + (loinc, upinc) = inclusive + # + if axis is None: + shp = a.shape + return _winsorize1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp) + else: + return ma.apply_along_axis(_winsorize1D, axis,a,lolim,uplim,loinc,upinc) + + +#####-------------------------------------------------------------------------- +#---- --- Moments --- +#####-------------------------------------------------------------------------- + +def moment(a, moment=1, axis=0): + a, axis = _chk_asarray(a, axis) + if moment == 1: + # By definition the first moment about the mean is 0. + shape = list(a.shape) + del shape[axis] + if shape: + # return an actual array of the appropriate shape + return np.zeros(shape, dtype=float) + else: + # the input was 1D, so return a scalar instead of a rank-0 array + return np.float64(0.0) + else: + mn = ma.expand_dims(a.mean(axis=axis), axis) + s = ma.power((a-mn), moment) + return s.mean(axis=axis) +moment.__doc__ = stats.moment.__doc__ + + +def variation(a, axis=0): + a, axis = _chk_asarray(a, axis) + return a.std(axis)/a.mean(axis) +variation.__doc__ = stats.variation.__doc__ + + +def skew(a, axis=0, bias=True): + a, axis = _chk_asarray(a,axis) + n = a.count(axis) + m2 = moment(a, 2, axis) + m3 = moment(a, 3, axis) + olderr = np.seterr(all='ignore') + try: + vals = ma.where(m2 == 0, 0, m3 / m2**1.5) + finally: + np.seterr(**olderr) + + if not bias: + can_correct = (n > 2) & (m2 > 0) + if can_correct.any(): + m2 = np.extract(can_correct, m2) + m3 = np.extract(can_correct, m3) + nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5 + np.place(vals, can_correct, nval) + return vals +skew.__doc__ = stats.skew.__doc__ + + +def kurtosis(a, axis=0, fisher=True, bias=True): + a, axis = _chk_asarray(a, axis) + m2 = moment(a,2,axis) + m4 = moment(a,4,axis) + olderr = np.seterr(all='ignore') + try: + vals = ma.where(m2 == 0, 0, m4 / m2**2.0) + finally: + np.seterr(**olderr) + + if not bias: + n = a.count(axis) + can_correct = (n > 3) & (m2 is not ma.masked and m2 > 0) + if can_correct.any(): + n = np.extract(can_correct, n) + m2 = np.extract(can_correct, m2) + m4 = np.extract(can_correct, m4) + nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0) + np.place(vals, can_correct, nval+3.0) + if fisher: + return vals - 3 + else: + return vals +kurtosis.__doc__ = stats.kurtosis.__doc__ + + +def describe(a, axis=0): + 
""" + Computes several descriptive statistics of the passed array. + + Parameters + ---------- + a : array + + axis : int or None + + Returns + ------- + n : int + (size of the data (discarding missing values) + mm : (int, int) + min, max + + arithmetic mean : float + + unbiased variance : float + + biased skewness : float + + biased kurtosis : float + + Examples + -------- + + >>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1]) + >>> describe(ma) + (array(3), + (0, 2), + 1.0, + 1.0, + masked_array(data = 0.0, + mask = False, + fill_value = 1e+20) + , + -1.5) + + """ + a, axis = _chk_asarray(a, axis) + n = a.count(axis) + mm = (ma.minimum.reduce(a), ma.maximum.reduce(a)) + m = a.mean(axis) + v = a.var(axis) + sk = skew(a, axis) + kurt = kurtosis(a, axis) + return n, mm, m, v, sk, kurt + +#............................................................................. + + +def stde_median(data, axis=None): + """Returns the McKean-Schrader estimate of the standard error of the sample +median along the given axis. masked values are discarded. + + Parameters + ---------- + data : ndarray + Data to trim. + axis : {None,int}, optional + Axis along which to perform the trimming. + If None, the input array is first flattened. + + """ + def _stdemed_1D(data): + data = np.sort(data.compressed()) + n = len(data) + z = 2.5758293035489004 + k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0)) + return ((data[n-k] - data[k-1])/(2.*z)) + # + data = ma.array(data, copy=False, subok=True) + if (axis is None): + return _stdemed_1D(data) + else: + if data.ndim > 2: + raise ValueError("Array 'data' must be at most two dimensional, but got data.ndim = %d" % data.ndim) + return ma.apply_along_axis(_stdemed_1D, axis, data) + +#####-------------------------------------------------------------------------- +#---- --- Normality Tests --- +#####-------------------------------------------------------------------------- + + +def skewtest(a, axis=0): + a, axis = _chk_asarray(a, axis) + if axis is None: + a = a.ravel() + axis = 0 + b2 = skew(a,axis) + n = a.count(axis) + if np.min(n) < 8: + raise ValueError( + "skewtest is not valid with less than 8 samples; %i samples" + " were given." % np.min(n)) + y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2))) + beta2 = (3.0*(n*n+27*n-70)*(n+1)*(n+3)) / ((n-2.0)*(n+5)*(n+7)*(n+9)) + W2 = -1 + ma.sqrt(2*(beta2-1)) + delta = 1/ma.sqrt(0.5*ma.log(W2)) + alpha = ma.sqrt(2.0/(W2-1)) + y = ma.where(y == 0, 1, y) + Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1)) + return Z, (1.0 - stats.zprob(Z))*2 +skewtest.__doc__ = stats.skewtest.__doc__ + + +def kurtosistest(a, axis=0): + a, axis = _chk_asarray(a, axis) + n = a.count(axis=axis) + if np.min(n) < 5: + raise ValueError( + "kurtosistest requires at least 5 observations; %i observations" + " were given." % np.min(n)) + if np.min(n) < 20: + warnings.warn( + "kurtosistest only valid for n>=20 ... 
continuing anyway, n=%i" % + np.min(n)) + b2 = kurtosis(a, axis, fisher=False) + E = 3.0*(n-1) / (n+1) + varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1)*(n+3)*(n+5)) + x = (b2-E)/ma.sqrt(varb2) + sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) / + (n*(n-2)*(n-3))) + A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2))) + term1 = 1 - 2./(9.0*A) + denom = 1 + x*ma.sqrt(2/(A-4.0)) + if np.ma.isMaskedArray(denom): + # For multi-dimensional array input + denom[denom < 0] = masked + elif denom < 0: + denom = masked + + term2 = ma.power((1-2.0/A)/denom,1/3.0) + Z = (term1 - term2) / np.sqrt(2/(9.0*A)) + return Z, 2 * distributions.norm.sf(np.abs(Z)) +kurtosistest.__doc__ = stats.kurtosistest.__doc__ + + +def normaltest(a, axis=0): + a, axis = _chk_asarray(a, axis) + s,_ = skewtest(a,axis) + k,_ = kurtosistest(a,axis) + k2 = s*s + k*k + return k2, stats.chisqprob(k2,2) +normaltest.__doc__ = stats.normaltest.__doc__ + +# Martinez-Iglewicz test +# K-S test + + +#####-------------------------------------------------------------------------- +#---- --- Percentiles --- +#####-------------------------------------------------------------------------- + + +def mquantiles(a, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None, + limit=()): + """ + Computes empirical quantiles for a data array. + + Samples quantile are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``, + where ``x[j]`` is the j-th order statistic, and gamma is a function of + ``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and + ``g = n*p + m - j``. + + Reinterpreting the above equations to compare to **R** lead to the + equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)`` + + Typical values of (alphap,betap) are: + - (0,1) : ``p(k) = k/n`` : linear interpolation of cdf + (**R** type 4) + - (.5,.5) : ``p(k) = (k - 1/2.)/n`` : piecewise linear function + (**R** type 5) + - (0,0) : ``p(k) = k/(n+1)`` : + (**R** type 6) + - (1,1) : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])]. + (**R** type 7, **R** default) + - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])]. + The resulting quantile estimates are approximately median-unbiased + regardless of the distribution of x. + (**R** type 8) + - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom. + The resulting quantile estimates are approximately unbiased + if x is normally distributed + (**R** type 9) + - (.4,.4) : approximately quantile unbiased (Cunnane) + - (.35,.35): APL, used with PWM + + Parameters + ---------- + a : array_like + Input data, as a sequence or array of dimension at most 2. + prob : array_like, optional + List of quantiles to compute. + alphap : float, optional + Plotting positions parameter, default is 0.4. + betap : float, optional + Plotting positions parameter, default is 0.4. + axis : int, optional + Axis along which to perform the trimming. + If None (default), the input array is first flattened. + limit : tuple + Tuple of (lower, upper) values. + Values of `a` outside this open interval are ignored. + + Returns + ------- + mquantiles : MaskedArray + An array containing the calculated quantiles. + + Notes + ----- + This formulation is very similar to **R** except the calculation of + ``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined + with each type. + + References + ---------- + .. 
[1] *R* statistical software at http://www.r-project.org/ + + Examples + -------- + >>> from scipy.stats.mstats import mquantiles + >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.]) + >>> mquantiles(a) + array([ 19.2, 40. , 42.8]) + + Using a 2D array, specifying axis and limit. + + >>> data = np.array([[ 6., 7., 1.], + [ 47., 15., 2.], + [ 49., 36., 3.], + [ 15., 39., 4.], + [ 42., 40., -999.], + [ 41., 41., -999.], + [ 7., -999., -999.], + [ 39., -999., -999.], + [ 43., -999., -999.], + [ 40., -999., -999.], + [ 36., -999., -999.]]) + >>> mquantiles(data, axis=0, limit=(0, 50)) + array([[ 19.2 , 14.6 , 1.45], + [ 40. , 37.5 , 2.5 ], + [ 42.8 , 40.05, 3.55]]) + + >>> data[:, 2] = -999. + >>> mquantiles(data, axis=0, limit=(0, 50)) + masked_array(data = + [[19.2 14.6 --] + [40.0 37.5 --] + [42.8 40.05 --]], + mask = + [[False False True] + [False False True] + [False False True]], + fill_value = 1e+20) + + """ + def _quantiles1D(data,m,p): + x = np.sort(data.compressed()) + n = len(x) + if n == 0: + return ma.array(np.empty(len(p), dtype=float), mask=True) + elif n == 1: + return ma.array(np.resize(x, p.shape), mask=nomask) + aleph = (n*p + m) + k = np.floor(aleph.clip(1, n-1)).astype(int) + gamma = (aleph-k).clip(0,1) + return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()] + + # Initialization & checks --------- + data = ma.array(a, copy=False) + if data.ndim > 2: + raise TypeError("Array should be 2D at most !") + # + if limit: + condition = (limit[0] < data) & (data < limit[1]) + data[~condition.filled(True)] = masked + # + p = np.array(prob, copy=False, ndmin=1) + m = alphap + p*(1.-alphap-betap) + # Computes quantiles along axis (or globally) + if (axis is None): + return _quantiles1D(data, m, p) + return ma.apply_along_axis(_quantiles1D, axis, data, m, p) + + +def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4): + """Calculate the score at the given 'per' percentile of the + sequence a. For example, the score at per=50 is the median. + + This function is a shortcut to mquantile + + """ + if (per < 0) or (per > 100.): + raise ValueError("The percentile should be between 0. and 100. !" + " (got %s)" % per) + return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap, + limit=limit, axis=0).squeeze() + + +def plotting_positions(data, alpha=0.4, beta=0.4): + """ + Returns plotting positions (or empirical percentile points) for the data. + + Plotting positions are defined as ``(i-alpha)/(n+1-alpha-beta)``, where: + - i is the rank order statistics + - n is the number of unmasked values along the given axis + - `alpha` and `beta` are two parameters. + + Typical values for `alpha` and `beta` are: + - (0,1) : ``p(k) = k/n``, linear interpolation of cdf (R, type 4) + - (.5,.5) : ``p(k) = (k-1/2.)/n``, piecewise linear function + (R, type 5) + - (0,0) : ``p(k) = k/(n+1)``, Weibull (R type 6) + - (1,1) : ``p(k) = (k-1)/(n-1)``, in this case, + ``p(k) = mode[F(x[k])]``. That's R default (R type 7) + - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``, then + ``p(k) ~ median[F(x[k])]``. + The resulting quantile estimates are approximately median-unbiased + regardless of the distribution of x. (R type 8) + - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``, Blom. 
+ The resulting quantile estimates are approximately unbiased + if x is normally distributed (R type 9) + - (.4,.4) : approximately quantile unbiased (Cunnane) + - (.35,.35): APL, used with PWM + - (.3175, .3175): used in scipy.stats.probplot + + Parameters + ---------- + data : array_like + Input data, as a sequence or array of dimension at most 2. + alpha : float, optional + Plotting positions parameter. Default is 0.4. + beta : float, optional + Plotting positions parameter. Default is 0.4. + + Returns + ------- + positions : MaskedArray + The calculated plotting positions. + + """ + data = ma.array(data, copy=False).reshape(1,-1) + n = data.count() + plpos = np.empty(data.size, dtype=float) + plpos[n:] = 0 + plpos[data.argsort()[:n]] = (np.arange(1, n+1) - alpha) / \ + (n + 1.0 - alpha - beta) + return ma.array(plpos, mask=data._mask) + +meppf = plotting_positions + +#####-------------------------------------------------------------------------- +#---- --- Variability --- +#####-------------------------------------------------------------------------- + + +def obrientransform(*args): + """ +Computes a transform on input data (any number of columns). Used to +test for homogeneity of variance prior to running one-way stats. Each +array in *args is one level of a factor. If an F_oneway() run on the +transformed data and found significant, variances are unequal. From +Maxwell and Delaney, p.112. + +Returns: transformed data for use in an ANOVA + """ + data = argstoarray(*args).T + v = data.var(axis=0,ddof=1) + m = data.mean(0) + n = data.count(0).astype(float) + # result = ((N-1.5)*N*(a-m)**2 - 0.5*v*(n-1))/((n-1)*(n-2)) + data -= m + data **= 2 + data *= (n-1.5)*n + data -= 0.5*v*(n-1) + data /= (n-1.)*(n-2.) + if not ma.allclose(v,data.mean(0)): + raise ValueError("Lack of convergence in obrientransform.") + return data + + +def signaltonoise(data, axis=0): + """Calculates the signal-to-noise ratio, as the ratio of the mean over + standard deviation along the given axis. + + Parameters + ---------- + data : sequence + Input data + axis : {0, int}, optional + Axis along which to compute. If None, the computation is performed + on a flat version of the array. +""" + data = ma.array(data, copy=False) + m = data.mean(axis) + sd = data.std(axis, ddof=0) + return m/sd + + +def sem(a, axis=0, ddof=1): + a, axis = _chk_asarray(a, axis) + n = a.count(axis=axis) + s = a.std(axis=axis, ddof=ddof) / ma.sqrt(n) + return s +sem.__doc__ = stats.sem.__doc__ + +zmap = stats.zmap +zscore = stats.zscore + + +#####-------------------------------------------------------------------------- +#---- --- ANOVA --- +#####-------------------------------------------------------------------------- + + +def f_oneway(*args): + """ +Performs a 1-way ANOVA, returning an F-value and probability given +any number of groups. From Heiman, pp.394-7. 
+ +Usage: f_oneway (*args) where *args is 2 or more arrays, one per + treatment group +Returns: f-value, probability +""" + # Construct a single array of arguments: each row is a group + data = argstoarray(*args) + ngroups = len(data) + ntot = data.count() + sstot = (data**2).sum() - (data.sum())**2/float(ntot) + ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum() + sswg = sstot-ssbg + dfbg = ngroups-1 + dfwg = ntot - ngroups + msb = ssbg/float(dfbg) + msw = sswg/float(dfwg) + f = msb/msw + prob = stats.fprob(dfbg,dfwg,f) + return f, prob + + +def f_value_wilks_lambda(ER, EF, dfnum, dfden, a, b): + """Calculation of Wilks lambda F-statistic for multivarite data, per + Maxwell & Delaney p.657. + """ + ER = ma.array(ER, copy=False, ndmin=2) + EF = ma.array(EF, copy=False, ndmin=2) + if ma.getmask(ER).any() or ma.getmask(EF).any(): + raise NotImplementedError("Not implemented when the inputs " + "have missing data") + lmbda = np.linalg.det(EF) / np.linalg.det(ER) + q = ma.sqrt(((a-1)**2*(b-1)**2 - 2) / ((a-1)**2 + (b-1)**2 - 5)) + q = ma.filled(q, 1) + n_um = (1 - lmbda**(1.0/q))*(a-1)*(b-1) + d_en = lmbda**(1.0/q) / (n_um*q - 0.5*(a-1)*(b-1) + 1) + return n_um / d_en + + +def friedmanchisquare(*args): + """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA. + This function calculates the Friedman Chi-square test for repeated measures + and returns the result, along with the associated probability value. + + Each input is considered a given group. Ideally, the number of treatments + among each group should be equal. If this is not the case, only the first + n treatments are taken into account, where n is the number of treatments + of the smallest group. + If a group has some missing values, the corresponding treatments are masked + in the other groups. + The test statistic is corrected for ties. + + Masked values in one group are propagated to the other groups. + + Returns: chi-square statistic, associated p-value + """ + data = argstoarray(*args).astype(float) + k = len(data) + if k < 3: + raise ValueError("Less than 3 groups (%i): " % k + + "the Friedman test is NOT appropriate.") + ranked = ma.masked_values(rankdata(data, axis=0), 0) + if ranked._mask is not nomask: + ranked = ma.mask_cols(ranked) + ranked = ranked.compressed().reshape(k,-1).view(ndarray) + else: + ranked = ranked._data + (k,n) = ranked.shape + # Ties correction + repeats = np.array([find_repeats(_) for _ in ranked.T], dtype=object) + ties = repeats[repeats.nonzero()].reshape(-1,2)[:,-1].astype(int) + tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k)) + # + ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2) + chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction + return chisq, stats.chisqprob(chisq,k-1) + +#-############################################################################-# diff --git a/pywafo/src/wafo/stats/mstats_extras.py b/pywafo/src/wafo/stats/mstats_extras.py new file mode 100644 index 0000000..e71f99c --- /dev/null +++ b/pywafo/src/wafo/stats/mstats_extras.py @@ -0,0 +1,466 @@ +""" +Additional statistics functions, with support to MA. 
+ +:author: Pierre GF Gerard-Marchant +:contact: pierregm_at_uga_edu +:date: $Date: 2007-10-29 17:18:13 +0200 (Mon, 29 Oct 2007) $ +:version: $Id: morestats.py 3473 2007-10-29 15:18:13Z jarrod.millman $ +""" +from __future__ import division, print_function, absolute_import + +__author__ = "Pierre GF Gerard-Marchant" +__docformat__ = "restructuredtext en" + + +__all__ = ['compare_medians_ms', + 'hdquantiles', 'hdmedian', 'hdquantiles_sd', + 'idealfourths', + 'median_cihs','mjci','mquantiles_cimj', + 'rsh', + 'trimmed_mean_ci',] + +import numpy as np +from numpy import float_, int_, ndarray + +import numpy.ma as ma +from numpy.ma import MaskedArray + +from . import mstats_basic as mstats + +from scipy.stats.distributions import norm, beta, t, binom + + +#####-------------------------------------------------------------------------- +#---- --- Quantiles --- +#####-------------------------------------------------------------------------- +def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,): + """ + Computes quantile estimates with the Harrell-Davis method. + + The quantile estimates are calculated as a weighted linear combination + of order statistics. + + Parameters + ---------- + data : array_like + Data array. + prob : sequence + Sequence of quantiles to compute. + axis : int + Axis along which to compute the quantiles. If None, use a flattened + array. + var : boolean + Whether to return the variance of the estimate. + + Returns + ------- + hdquantiles : MaskedArray + A (p,) array of quantiles (if `var` is False), or a (2,p) array of + quantiles and variances (if `var` is True), where ``p`` is the + number of quantiles. + + """ + def _hd_1D(data,prob,var): + "Computes the HD quantiles for a 1D array. Returns nan for invalid data." + xsorted = np.squeeze(np.sort(data.compressed().view(ndarray))) + # Don't use length here, in case we have a numpy scalar + n = xsorted.size + #......... + hd = np.empty((2,len(prob)), float_) + if n < 2: + hd.flat = np.nan + if var: + return hd + return hd[0] + #......... + v = np.arange(n+1) / float(n) + betacdf = beta.cdf + for (i,p) in enumerate(prob): + _w = betacdf(v, (n+1)*p, (n+1)*(1-p)) + w = _w[1:] - _w[:-1] + hd_mean = np.dot(w, xsorted) + hd[0,i] = hd_mean + # + hd[1,i] = np.dot(w, (xsorted-hd_mean)**2) + # + hd[0, prob == 0] = xsorted[0] + hd[0, prob == 1] = xsorted[-1] + if var: + hd[1, prob == 0] = hd[1, prob == 1] = np.nan + return hd + return hd[0] + # Initialization & checks --------- + data = ma.array(data, copy=False, dtype=float_) + p = np.array(prob, copy=False, ndmin=1) + # Computes quantiles along axis (or globally) + if (axis is None) or (data.ndim == 1): + result = _hd_1D(data, p, var) + else: + if data.ndim > 2: + raise ValueError("Array 'data' must be at most two dimensional, but got data.ndim = %d" % data.ndim) + result = ma.apply_along_axis(_hd_1D, axis, data, p, var) + # + return ma.fix_invalid(result, copy=False) + +#.............................................................................. + + +def hdmedian(data, axis=-1, var=False): + """ + Returns the Harrell-Davis estimate of the median along the given axis. + + Parameters + ---------- + data : ndarray + Data array. + axis : int + Axis along which to compute the quantiles. If None, use a flattened + array. + var : boolean + Whether to return the variance of the estimate. + + """ + result = hdquantiles(data,[0.5], axis=axis, var=var) + return result.squeeze() + + +#.............................................................................. 
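+# Illustrative sketch of how the Harrell-Davis estimators above can be used
+# (hypothetical data; kept as a comment so it is not executed on import):
+#
+#     >>> import numpy as np
+#     >>> import numpy.ma as ma
+#     >>> x = ma.masked_invalid([1., 2., 3., 4., 5., np.nan])
+#     >>> hdquantiles(x, prob=[0.25, 0.5, 0.75])   # beta-weighted order statistics
+#     >>> hdmedian(x)                              # same estimator with prob=[0.5]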
+def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None): + """ + The standard error of the Harrell-Davis quantile estimates by jackknife. + + Parameters + ---------- + data : array_like + Data array. + prob : sequence + Sequence of quantiles to compute. + axis : int + Axis along which to compute the quantiles. If None, use a flattened + array. + + Returns + ------- + hdquantiles_sd : MaskedArray + Standard error of the Harrell-Davis quantile estimates. + + """ + def _hdsd_1D(data,prob): + "Computes the std error for 1D arrays." + xsorted = np.sort(data.compressed()) + n = len(xsorted) + #......... + hdsd = np.empty(len(prob), float_) + if n < 2: + hdsd.flat = np.nan + #......... + vv = np.arange(n) / float(n-1) + betacdf = beta.cdf + # + for (i,p) in enumerate(prob): + _w = betacdf(vv, (n+1)*p, (n+1)*(1-p)) + w = _w[1:] - _w[:-1] + mx_ = np.fromiter([np.dot(w,xsorted[np.r_[list(range(0,k)), + list(range(k+1,n))].astype(int_)]) + for k in range(n)], dtype=float_) + mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1) + hdsd[i] = float(n-1) * np.sqrt(np.diag(mx_var).diagonal() / float(n)) + return hdsd + # Initialization & checks --------- + data = ma.array(data, copy=False, dtype=float_) + p = np.array(prob, copy=False, ndmin=1) + # Computes quantiles along axis (or globally) + if (axis is None): + result = _hdsd_1D(data, p) + else: + if data.ndim > 2: + raise ValueError("Array 'data' must be at most two dimensional, but got data.ndim = %d" % data.ndim) + result = ma.apply_along_axis(_hdsd_1D, axis, data, p) + # + return ma.fix_invalid(result, copy=False).ravel() + + +#####-------------------------------------------------------------------------- +#---- --- Confidence intervals --- +#####-------------------------------------------------------------------------- + +def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True), + alpha=0.05, axis=None): + """ + Selected confidence interval of the trimmed mean along the given axis. + + Parameters + ---------- + data : array_like + Input data. + limits : {None, tuple}, optional + None or a two item tuple. + Tuple of the percentages to cut on each side of the array, with respect + to the number of unmasked data, as floats between 0. and 1. If ``n`` + is the number of unmasked data before trimming, then + (``n`` * `limits[0]`)th smallest data and (``n`` * `limits[1]`)th + largest data are masked. The total number of unmasked data after + trimming is ``n`` * (1. - sum(`limits`)). + The value of one limit can be set to None to indicate an open interval. + + Defaults to (0.2, 0.2). + inclusive : (2,) tuple of boolean, optional + If relative==False, tuple indicating whether values exactly equal to + the absolute limits are allowed. + If relative==True, tuple indicating whether the number of data being + masked on each side should be rounded (True) or truncated (False). + + Defaults to (True, True). + alpha : float, optional + Confidence level of the intervals. + + Defaults to 0.05. + axis : int, optional + Axis along which to cut. If None, uses a flattened version of `data`. + + Defaults to None. + + Returns + ------- + trimmed_mean_ci : (2,) ndarray + The lower and upper confidence intervals of the trimmed data. 
+ + """ + data = ma.array(data, copy=False) + trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis) + tmean = trimmed.mean(axis) + tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis) + df = trimmed.count(axis) - 1 + tppf = t.ppf(1-alpha/2.,df) + return np.array((tmean - tppf*tstde, tmean+tppf*tstde)) + +#.............................................................................. + + +def mjci(data, prob=[0.25,0.5,0.75], axis=None): + """ + Returns the Maritz-Jarrett estimators of the standard error of selected + experimental quantiles of the data. + + Parameters + ---------- + data: ndarray + Data array. + prob: sequence + Sequence of quantiles to compute. + axis : int + Axis along which to compute the quantiles. If None, use a flattened + array. + + """ + def _mjci_1D(data, p): + data = np.sort(data.compressed()) + n = data.size + prob = (np.array(p) * n + 0.5).astype(int_) + betacdf = beta.cdf + # + mj = np.empty(len(prob), float_) + x = np.arange(1,n+1, dtype=float_) / n + y = x - 1./n + for (i,m) in enumerate(prob): + (m1,m2) = (m-1, n-m) + W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m) + C1 = np.dot(W,data) + C2 = np.dot(W,data**2) + mj[i] = np.sqrt(C2 - C1**2) + return mj + # + data = ma.array(data, copy=False) + if data.ndim > 2: + raise ValueError("Array 'data' must be at most two dimensional, but got data.ndim = %d" % data.ndim) + p = np.array(prob, copy=False, ndmin=1) + # Computes quantiles along axis (or globally) + if (axis is None): + return _mjci_1D(data, p) + else: + return ma.apply_along_axis(_mjci_1D, axis, data, p) + +#.............................................................................. + + +def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None): + """ + Computes the alpha confidence interval for the selected quantiles of the + data, with Maritz-Jarrett estimators. + + Parameters + ---------- + data : ndarray + Data array. + prob : sequence + Sequence of quantiles to compute. + alpha : float + Confidence level of the intervals. + axis : integer + Axis along which to compute the quantiles. + If None, use a flattened array. + + """ + alpha = min(alpha, 1-alpha) + z = norm.ppf(1-alpha/2.) + xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis) + smj = mjci(data, prob, axis=axis) + return (xq - z * smj, xq + z * smj) + + +#............................................................................. +def median_cihs(data, alpha=0.05, axis=None): + """ + Computes the alpha-level confidence interval for the median of the data. + + Uses the Hettmasperger-Sheather method. + + Parameters + ---------- + data : array_like + Input data. Masked values are discarded. The input should be 1D only, + or `axis` should be set to None. + alpha : float + Confidence level of the intervals. + axis : integer + Axis along which to compute the quantiles. If None, use a flattened + array. + + Returns + ------- + median_cihs : + Alpha level confidence interval. 
+ + """ + def _cihs_1D(data, alpha): + data = np.sort(data.compressed()) + n = len(data) + alpha = min(alpha, 1-alpha) + k = int(binom._ppf(alpha/2., n, 0.5)) + gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5) + if gk < 1-alpha: + k -= 1 + gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5) + gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5) + I = (gk - 1 + alpha)/(gk - gkk) + lambd = (n-k) * I / float(k + (n-2*k)*I) + lims = (lambd*data[k] + (1-lambd)*data[k-1], + lambd*data[n-k-1] + (1-lambd)*data[n-k]) + return lims + data = ma.rray(data, copy=False) + # Computes quantiles along axis (or globally) + if (axis is None): + result = _cihs_1D(data.compressed(), alpha) + else: + if data.ndim > 2: + raise ValueError("Array 'data' must be at most two dimensional, but got data.ndim = %d" % data.ndim) + result = ma.apply_along_axis(_cihs_1D, axis, data, alpha) + # + return result + +#.............................................................................. + + +def compare_medians_ms(group_1, group_2, axis=None): + """ + Compares the medians from two independent groups along the given axis. + + The comparison is performed using the McKean-Schrader estimate of the + standard error of the medians. + + Parameters + ---------- + group_1 : array_like + First dataset. + group_2 : array_like + Second dataset. + axis : int, optional + Axis along which the medians are estimated. If None, the arrays are + flattened. If `axis` is not None, then `group_1` and `group_2` + should have the same shape. + + Returns + ------- + compare_medians_ms : {float, ndarray} + If `axis` is None, then returns a float, otherwise returns a 1-D + ndarray of floats with a length equal to the length of `group_1` + along `axis`. + + """ + (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis)) + (std_1, std_2) = (mstats.stde_median(group_1, axis=axis), + mstats.stde_median(group_2, axis=axis)) + W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2) + return 1 - norm.cdf(W) + + +def idealfourths(data, axis=None): + """ + Returns an estimate of the lower and upper quartiles. + + Uses the ideal fourths algorithm. + + Parameters + ---------- + data : array_like + Input array. + axis : int, optional + Axis along which the quartiles are estimated. If None, the arrays are + flattened. + + Returns + ------- + idealfourths : {list of floats, masked array} + Returns the two internal values that divide `data` into four parts + using the ideal fourths algorithm either along the flattened array + (if `axis` is None) or along `axis` of `data`. + + """ + def _idf(data): + x = data.compressed() + n = len(x) + if n < 3: + return [np.nan,np.nan] + (j,h) = divmod(n/4. + 5/12.,1) + j = int(j) + qlo = (1-h)*x[j-1] + h*x[j] + k = n - j + qup = (1-h)*x[k] + h*x[k-1] + return [qlo, qup] + data = ma.sort(data, axis=axis).view(MaskedArray) + if (axis is None): + return _idf(data) + else: + return ma.apply_along_axis(_idf, axis, data) + + +def rsh(data, points=None): + """ + Evaluates Rosenblatt's shifted histogram estimators for each point + on the dataset 'data'. + + Parameters + ---------- + data : sequence + Input data. Masked values are ignored. + points : sequence + Sequence of points where to evaluate Rosenblatt shifted histogram. + If None, use the data. 
+ + """ + data = ma.array(data, copy=False) + if points is None: + points = data + else: + points = np.array(points, copy=False, ndmin=1) + if data.ndim != 1: + raise AttributeError("The input array should be 1D only !") + n = data.count() + r = idealfourths(data, axis=None) + h = 1.2 * (r[-1]-r[0]) / n**(1./5) + nhi = (data[:,None] <= points[None,:] + h).sum(0) + nlo = (data[:,None] < points[None,:] - h).sum(0) + return (nhi-nlo) / (2.*n*h) + + +############################################################################### diff --git a/pywafo/src/wafo/stats/rv.py b/pywafo/src/wafo/stats/rv.py new file mode 100644 index 0000000..1ef8b36 --- /dev/null +++ b/pywafo/src/wafo/stats/rv.py @@ -0,0 +1,76 @@ +from __future__ import division, print_function, absolute_import + +from numpy import vectorize, deprecate +from numpy.random import random_sample + +__all__ = ['randwppf', 'randwcdf'] + +# XXX: Are these needed anymore? + +##################################### +# General purpose continuous +###################################### + + +@deprecate(message="Deprecated in scipy 0.14.0, use " + "distribution-specific rvs() method instead") +def randwppf(ppf, args=(), size=None): + """ + returns an array of randomly distributed integers of a distribution + whose percent point function (inverse of the CDF or quantile function) + is given. + + args is a tuple of extra arguments to the ppf function (i.e. shape, + location, scale), and size is the size of the output. Note the ppf + function must accept an array of q values to compute over. + + """ + U = random_sample(size=size) + return ppf(*(U,)+args) + + +@deprecate(message="Deprecated in scipy 0.14.0, use " + "distribution-specific rvs() method instead") +def randwcdf(cdf, mean=1.0, args=(), size=None): + """ + Returns an array of randomly distributed integers given a CDF. + + Given a cumulative distribution function (CDF) returns an array of + randomly distributed integers that would satisfy the CDF. + + Parameters + ---------- + cdf : function + CDF function that accepts a single value and `args`, and returns + an single value. + mean : float, optional + The mean of the distribution which helps the solver. Defaults + to 1.0. + args : tuple, optional + Extra arguments to the cdf function (i.e. shape, location, scale) + size : {int, None}, optional + Is the size of the output. If None, only 1 value will be returned. + + Returns + ------- + randwcdf : ndarray + Array of random numbers. + + Notes + ----- + Can use the ``scipy.stats.distributions.*.cdf`` functions for the + `cdf` parameter. + + """ + import scipy.optimize as optimize + + def _ppfopt(x, q, *nargs): + newargs = (x,)+nargs + return cdf(*newargs) - q + + def _ppf(q, *nargs): + return optimize.fsolve(_ppfopt, mean, args=(q,)+nargs) + + _vppf = vectorize(_ppf) + U = random_sample(size=size) + return _vppf(*(U,)+args) diff --git a/pywafo/src/wafo/stats/six.py b/pywafo/src/wafo/stats/six.py index c537546..f0bc0b6 100644 --- a/pywafo/src/wafo/stats/six.py +++ b/pywafo/src/wafo/stats/six.py @@ -307,7 +307,7 @@ _add_doc(u, """Text literal""") if PY3: - import builtins + import builtins # @UnresolvedImport exec_ = getattr(builtins, "exec") def reraise(tp, value, tb=None): diff --git a/pywafo/src/wafo/stats/stats.py b/pywafo/src/wafo/stats/stats.py new file mode 100644 index 0000000..8e7f1b6 --- /dev/null +++ b/pywafo/src/wafo/stats/stats.py @@ -0,0 +1,4354 @@ +# Copyright (c) Gary Strangman. All rights reserved +# +# Disclaimer +# +# This software is provided "as-is". 
There are no expressed or implied +# warranties of any kind, including, but not limited to, the warranties +# of merchantability and fitness for a given application. In no event +# shall Gary Strangman be liable for any direct, indirect, incidental, +# special, exemplary or consequential damages (including, but not limited +# to, loss of use, data or profits, or business interruption) however +# caused and on any theory of liability, whether in contract, strict +# liability or tort (including negligence or otherwise) arising in any way +# out of the use of this software, even if advised of the possibility of +# such damage. +# + +# +# Heavily adapted for use by SciPy 2002 by Travis Oliphant +""" +A collection of basic statistical functions for python. The function +names appear below. + + Some scalar functions defined here are also available in the scipy.special + package where they work on arbitrary sized arrays. + +Disclaimers: The function list is obviously incomplete and, worse, the +functions are not optimized. All functions have been tested (some more +so than others), but they are far from bulletproof. Thus, as with any +free software, no warranty or guarantee is expressed or implied. :-) A +few extra functions that don't appear in the list below can be found by +interested treasure-hunters. These functions don't necessarily have +both list and array versions but were deemed useful. + +Central Tendency +---------------- +.. autosummary:: + :toctree: generated/ + + gmean + hmean + mode + +Moments +------- +.. autosummary:: + :toctree: generated/ + + moment + variation + skew + kurtosis + normaltest + +Moments Handling NaN: + +.. autosummary:: + :toctree: generated/ + + nanmean + nanmedian + nanstd + +Altered Versions +---------------- +.. autosummary:: + :toctree: generated/ + + tmean + tvar + tstd + tsem + describe + +Frequency Stats +--------------- +.. autosummary:: + :toctree: generated/ + + itemfreq + scoreatpercentile + percentileofscore + histogram + cumfreq + relfreq + +Variability +----------- +.. autosummary:: + :toctree: generated/ + + obrientransform + signaltonoise + sem + +Trimming Functions +------------------ +.. autosummary:: + :toctree: generated/ + + threshold + trimboth + trim1 + +Correlation Functions +--------------------- +.. autosummary:: + :toctree: generated/ + + pearsonr + fisher_exact + spearmanr + pointbiserialr + kendalltau + linregress + +Inferential Stats +----------------- +.. autosummary:: + :toctree: generated/ + + ttest_1samp + ttest_ind + ttest_rel + chisquare + power_divergence + ks_2samp + mannwhitneyu + ranksums + wilcoxon + kruskal + friedmanchisquare + +Probability Calculations +------------------------ +.. autosummary:: + :toctree: generated/ + + chisqprob + zprob + fprob + betai + +ANOVA Functions +--------------- +.. autosummary:: + :toctree: generated/ + + f_oneway + f_value + +Support Functions +----------------- +.. autosummary:: + :toctree: generated/ + + ss + square_of_sums + rankdata + +References +---------- +.. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard + Probability and Statistics Tables and Formulae. Chapman & Hall: New + York. 2000. + +""" + +from __future__ import division, print_function, absolute_import + +import warnings +import math + +#from .six import xrange + +# friedmanchisquare patch uses python sum +pysum = sum # save it before it gets overwritten + +# Scipy imports. 
+from scipy.lib.six import callable, string_types +from numpy import array, asarray, ma, zeros, sum +import scipy.special as special +import scipy.linalg as linalg +import numpy as np + +from . import futil +from . import distributions +try: + from ._rank import rankdata, tiecorrect +except: + rankdata=tiecorrect=None +__all__ = ['find_repeats', 'gmean', 'hmean', 'mode', + 'tmean', 'tvar', 'tmin', 'tmax', 'tstd', 'tsem', + 'moment', 'variation', 'skew', 'kurtosis', 'describe', + 'skewtest', 'kurtosistest', 'normaltest', 'jarque_bera', + 'itemfreq', 'scoreatpercentile', 'percentileofscore', + 'histogram', 'histogram2', 'cumfreq', 'relfreq', + 'obrientransform', 'signaltonoise', 'sem', 'zmap', 'zscore', + 'threshold', 'sigmaclip', 'trimboth', 'trim1', 'trim_mean', + 'f_oneway', 'pearsonr', 'fisher_exact', + 'spearmanr', 'pointbiserialr', 'kendalltau', 'linregress', + 'ttest_1samp', 'ttest_ind', 'ttest_rel', 'kstest', + 'chisquare', 'power_divergence', 'ks_2samp', 'mannwhitneyu', + 'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare', + 'zprob', 'chisqprob', 'ksprob', 'fprob', 'betai', + 'f_value_wilks_lambda', 'f_value', 'f_value_multivariate', + 'ss', 'square_of_sums', + 'fastsort', 'rankdata', + 'nanmean', 'nanstd', 'nanmedian', + ] + + +def _chk_asarray(a, axis): + if axis is None: + a = np.ravel(a) + outaxis = 0 + else: + a = np.asarray(a) + outaxis = axis + return a, outaxis + + +def _chk2_asarray(a, b, axis): + if axis is None: + a = np.ravel(a) + b = np.ravel(b) + outaxis = 0 + else: + a = np.asarray(a) + b = np.asarray(b) + outaxis = axis + return a, b, outaxis + + +def find_repeats(arr): + """ + Find repeats and repeat counts. + + Parameters + ---------- + arr : array_like + Input array + + Returns + ------- + find_repeats : tuple + Returns a tuple of two 1-D ndarrays. The first ndarray are the repeats + as sorted, unique values that are repeated in `arr`. The second + ndarray are the counts mapped one-to-one of the repeated values + in the first ndarray. + + Examples + -------- + >>> sp.stats.find_repeats([2, 1, 2, 3, 2, 2, 5]) + (array([ 2. ]), array([ 4 ], dtype=int32) + + >>> sp.stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]]) + (array([ 4., 5.]), array([2, 2], dtype=int32)) + + """ + v1,v2, n = futil.dfreps(arr) + return v1[:n],v2[:n] + +####### +### NAN friendly functions +######## + + +def nanmean(x, axis=0): + """ + Compute the mean over the given axis ignoring nans. + + Parameters + ---------- + x : ndarray + Input array. + axis : int, optional + Axis along which the mean is computed. Default is 0, i.e. the + first axis. + + Returns + ------- + m : float + The mean of `x`, ignoring nans. + + See Also + -------- + nanstd, nanmedian + + Examples + -------- + >>> from scipy import stats + >>> a = np.linspace(0, 4, 3) + >>> a + array([ 0., 2., 4.]) + >>> a[-1] = np.nan + >>> stats.nanmean(a) + 1.0 + + """ + x, axis = _chk_asarray(x, axis) + x = x.copy() + Norig = x.shape[axis] + mask = np.isnan(x) + factor = 1.0 - np.sum(mask, axis) / Norig + + x[mask] = 0.0 + return np.mean(x, axis) / factor + + +def nanstd(x, axis=0, bias=False): + """ + Compute the standard deviation over the given axis, ignoring nans. + + Parameters + ---------- + x : array_like + Input array. + axis : int or None, optional + Axis along which the standard deviation is computed. Default is 0. + If None, compute over the whole array `x`. + bias : bool, optional + If True, the biased (normalized by N) definition is used. If False + (default), the unbiased definition is used. 
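# Sketch (made-up data) of the masking trick nanmean above relies on: NaNs are
# zeroed out and the plain mean is rescaled by the fraction of valid entries.
# Recent NumPy versions also expose np.nanmean / np.nanstd directly.
import numpy as np

x = np.array([0.0, 2.0, np.nan, 4.0])
mask = np.isnan(x)
factor = 1.0 - mask.sum() / float(x.size)       # fraction of valid entries
m = np.where(mask, 0.0, x).mean() / factor      # equals np.nanmean(x) == 2.0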
+ + Returns + ------- + s : float + The standard deviation. + + See Also + -------- + nanmean, nanmedian + + Examples + -------- + >>> from scipy import stats + >>> a = np.arange(10, dtype=float) + >>> a[1:3] = np.nan + >>> np.std(a) + nan + >>> stats.nanstd(a) + 2.9154759474226504 + >>> stats.nanstd(a.reshape(2, 5), axis=1) + array([ 2.0817, 1.5811]) + >>> stats.nanstd(a.reshape(2, 5), axis=None) + 2.9154759474226504 + + """ + x, axis = _chk_asarray(x, axis) + x = x.copy() + Norig = x.shape[axis] + + mask = np.isnan(x) + Nnan = np.sum(mask, axis) * 1.0 + n = Norig - Nnan + + x[mask] = 0.0 + m1 = np.sum(x, axis) / n + + if axis: + d = x - np.expand_dims(m1, axis) + else: + d = x - m1 + + d *= d + + m2 = np.sum(d, axis) - m1 * m1 * Nnan + + if bias: + m2c = m2 / n + else: + m2c = m2 / (n - 1.0) + + return np.sqrt(m2c) + + +def _nanmedian(arr1d): # This only works on 1d arrays + """Private function for rank a arrays. Compute the median ignoring Nan. + + Parameters + ---------- + arr1d : ndarray + Input array, of rank 1. + + Results + ------- + m : float + The median. + """ + cond = 1-np.isnan(arr1d) + x = np.sort(np.compress(cond,arr1d,axis=-1)) + if x.size == 0: + return np.nan + return np.median(x) + + +def nanmedian(x, axis=0): + """ + Compute the median along the given axis ignoring nan values. + + Parameters + ---------- + x : array_like + Input array. + axis : int, optional + Axis along which the median is computed. Default is 0, i.e. the + first axis. + + Returns + ------- + m : float + The median of `x` along `axis`. + + See Also + -------- + nanstd, nanmean + + Examples + -------- + >>> from scipy import stats + >>> a = np.array([0, 3, 1, 5, 5, np.nan]) + >>> stats.nanmedian(a) + array(3.0) + + >>> b = np.array([0, 3, 1, 5, 5, np.nan, 5]) + >>> stats.nanmedian(b) + array(4.0) + + Example with axis: + + >>> c = np.arange(30.).reshape(5,6) + >>> idx = np.array([False, False, False, True, False] * 6).reshape(5,6) + >>> c[idx] = np.nan + >>> c + array([[ 0., 1., 2., nan, 4., 5.], + [ 6., 7., nan, 9., 10., 11.], + [ 12., nan, 14., 15., 16., 17.], + [ nan, 19., 20., 21., 22., nan], + [ 24., 25., 26., 27., nan, 29.]]) + >>> stats.nanmedian(c, axis=1) + array([ 2. , 9. , 15. , 20.5, 26. ]) + + """ + x, axis = _chk_asarray(x, axis) + if x.ndim == 0: + return float(x.item()) + x = x.copy() + x = np.apply_along_axis(_nanmedian, axis, x) + if x.ndim == 0: + x = float(x.item()) + return x + + +##################################### +######## CENTRAL TENDENCY ######## +##################################### + + +def gmean(a, axis=0, dtype=None): + """ + Compute the geometric mean along the specified axis. + + Returns the geometric average of the array elements. + That is: n-th root of (x1 * x2 * ... * xn) + + Parameters + ---------- + a : array_like + Input array or object that can be converted to an array. + axis : int, optional, default axis=0 + Axis along which the geometric mean is computed. + dtype : dtype, optional + Type of the returned array and of the accumulator in which the + elements are summed. If dtype is not specified, it defaults to the + dtype of a, unless a has an integer dtype with a precision less than + that of the default platform integer. In that case, the default + platform integer is used. 
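# Sketch (made-up data) of the log-domain evaluation used for the geometric
# mean: exp(mean(log(a))) equals the n-th root of the product while avoiding
# overflow for long or large-valued inputs.
import numpy as np

a = np.array([1.0, 2.0, 4.0, 8.0])
gm = np.exp(np.log(a).mean())                   # == 64 ** 0.25, about 2.83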
+ + Returns + ------- + gmean : ndarray + see dtype parameter above + + See Also + -------- + numpy.mean : Arithmetic average + numpy.average : Weighted average + hmean : Harmonic mean + + Notes + ----- + The geometric average is computed over a single dimension of the input + array, axis=0 by default, or all values in the array if axis=None. + float64 intermediate and return values are used for integer inputs. + + Use masked arrays to ignore any non-finite values in the input or that + arise in the calculations such as Not a Number and infinity because masked + arrays automatically mask any non-finite values. + + """ + if not isinstance(a, np.ndarray): # if not an ndarray object attempt to convert it + log_a = np.log(np.array(a, dtype=dtype)) + elif dtype: # Must change the default dtype allowing array type + if isinstance(a,np.ma.MaskedArray): + log_a = np.log(np.ma.asarray(a, dtype=dtype)) + else: + log_a = np.log(np.asarray(a, dtype=dtype)) + else: + log_a = np.log(a) + return np.exp(log_a.mean(axis=axis)) + + +def hmean(a, axis=0, dtype=None): + """ + Calculates the harmonic mean along the specified axis. + + That is: n / (1/x1 + 1/x2 + ... + 1/xn) + + Parameters + ---------- + a : array_like + Input array, masked array or object that can be converted to an array. + axis : int, optional, default axis=0 + Axis along which the harmonic mean is computed. + dtype : dtype, optional + Type of the returned array and of the accumulator in which the + elements are summed. If `dtype` is not specified, it defaults to the + dtype of `a`, unless `a` has an integer `dtype` with a precision less + than that of the default platform integer. In that case, the default + platform integer is used. + + Returns + ------- + hmean : ndarray + see `dtype` parameter above + + See Also + -------- + numpy.mean : Arithmetic average + numpy.average : Weighted average + gmean : Geometric mean + + Notes + ----- + The harmonic mean is computed over a single dimension of the input + array, axis=0 by default, or all values in the array if axis=None. + float64 intermediate and return values are used for integer inputs. + + Use masked arrays to ignore any non-finite values in the input or that + arise in the calculations such as Not a Number and infinity. + + """ + if not isinstance(a, np.ndarray): + a = np.array(a, dtype=dtype) + if np.all(a > 0): # Harmonic mean only defined if greater than zero + if isinstance(a, np.ma.MaskedArray): + size = a.count(axis) + else: + if axis is None: + a = a.ravel() + size = a.shape[0] + else: + size = a.shape[axis] + return size / np.sum(1.0/a, axis=axis, dtype=dtype) + else: + raise ValueError("Harmonic mean only defined if all elements greater than zero") + + +def mode(a, axis=0): + """ + Returns an array of the modal (most common) value in the passed array. + + If there is more than one such value, only the first is returned. + The bin-count for the modal bins is also returned. + + Parameters + ---------- + a : array_like + n-dimensional array of which to find mode(s). + axis : int, optional + Axis along which to operate. Default is 0, i.e. the first axis. + + Returns + ------- + vals : ndarray + Array of modal values. + counts : ndarray + Array of counts for each mode. 
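# Sketch (made-up data) of the harmonic mean computed by hmean above: the
# reciprocal of the average reciprocal, n / sum(1/x), defined only for
# strictly positive data.
import numpy as np

x = np.array([1.0, 2.0, 4.0])
hm = x.size / np.sum(1.0 / x)                   # == 3 / 1.75, about 1.71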
+ + Examples + -------- + >>> a = np.array([[6, 8, 3, 0], + [3, 2, 1, 7], + [8, 1, 8, 4], + [5, 3, 0, 5], + [4, 7, 5, 9]]) + >>> from scipy import stats + >>> stats.mode(a) + (array([[ 3., 1., 0., 0.]]), array([[ 1., 1., 1., 1.]])) + + To get mode of whole array, specify axis=None: + + >>> stats.mode(a, axis=None) + (array([ 3.]), array([ 3.])) + + """ + a, axis = _chk_asarray(a, axis) + scores = np.unique(np.ravel(a)) # get ALL unique values + testshape = list(a.shape) + testshape[axis] = 1 + oldmostfreq = np.zeros(testshape) + oldcounts = np.zeros(testshape) + for score in scores: + template = (a == score) + counts = np.expand_dims(np.sum(template, axis),axis) + mostfrequent = np.where(counts > oldcounts, score, oldmostfreq) + oldcounts = np.maximum(counts, oldcounts) + oldmostfreq = mostfrequent + return mostfrequent, oldcounts + + +def mask_to_limits(a, limits, inclusive): + """Mask an array for values outside of given limits. + + This is primarily a utility function. + + Parameters + ---------- + a : array + limits : (float or None, float or None) + A tuple consisting of the (lower limit, upper limit). Values in the + input array less than the lower limit or greater than the upper limit + will be masked out. None implies no limit. + inclusive : (bool, bool) + A tuple consisting of the (lower flag, upper flag). These flags + determine whether values exactly equal to lower or upper are allowed. + + Returns + ------- + A MaskedArray. + + Raises + ------ + A ValueError if there are no values within the given limits. + """ + lower_limit, upper_limit = limits + lower_include, upper_include = inclusive + am = ma.MaskedArray(a) + if lower_limit is not None: + if lower_include: + am = ma.masked_less(am, lower_limit) + else: + am = ma.masked_less_equal(am, lower_limit) + + if upper_limit is not None: + if upper_include: + am = ma.masked_greater(am, upper_limit) + else: + am = ma.masked_greater_equal(am, upper_limit) + + if am.count() == 0: + raise ValueError("No array values within given limits") + + return am + + +def tmean(a, limits=None, inclusive=(True, True)): + """ + Compute the trimmed mean. + + This function finds the arithmetic mean of given values, ignoring values + outside the given `limits`. + + Parameters + ---------- + a : array_like + Array of values. + limits : None or (lower limit, upper limit), optional + Values in the input array less than the lower limit or greater than the + upper limit will be ignored. When limits is None (default), then all + values are used. Either of the limit values in the tuple can also be + None representing a half-open interval. + inclusive : (bool, bool), optional + A tuple consisting of the (lower flag, upper flag). These flags + determine whether values exactly equal to the lower or upper limits + are included. The default value is (True, True). + + Returns + ------- + tmean : float + + """ + a = asarray(a) + if limits is None: + return np.mean(a, None) + + am = mask_to_limits(a.ravel(), limits, inclusive) + return am.mean() + + +def masked_var(am): + m = am.mean() + s = ma.add.reduce((am - m)**2) + n = am.count() - 1.0 + return s / n + + +def tvar(a, limits=None, inclusive=(True, True)): + """ + Compute the trimmed variance + + This function computes the sample variance of an array of values, + while ignoring values which are outside of given `limits`. + + Parameters + ---------- + a : array_like + Array of values. 
+ limits : None or (lower limit, upper limit), optional + Values in the input array less than the lower limit or greater than the + upper limit will be ignored. When limits is None, then all values are + used. Either of the limit values in the tuple can also be None + representing a half-open interval. The default value is None. + inclusive : (bool, bool), optional + A tuple consisting of the (lower flag, upper flag). These flags + determine whether values exactly equal to the lower or upper limits + are included. The default value is (True, True). + + Returns + ------- + tvar : float + Trimmed variance. + + Notes + ----- + `tvar` computes the unbiased sample variance, i.e. it uses a correction + factor ``n / (n - 1)``. + + """ + a = asarray(a) + a = a.astype(float).ravel() + if limits is None: + n = len(a) + return a.var()*(n/(n-1.)) + am = mask_to_limits(a, limits, inclusive) + return masked_var(am) + + +def tmin(a, lowerlimit=None, axis=0, inclusive=True): + """ + Compute the trimmed minimum + + This function finds the miminum value of an array `a` along the + specified axis, but only considering values greater than a specified + lower limit. + + Parameters + ---------- + a : array_like + array of values + lowerlimit : None or float, optional + Values in the input array less than the given limit will be ignored. + When lowerlimit is None, then all values are used. The default value + is None. + axis : None or int, optional + Operate along this axis. None means to use the flattened array and + the default is zero + inclusive : {True, False}, optional + This flag determines whether values exactly equal to the lower limit + are included. The default value is True. + + Returns + ------- + tmin : float + + """ + a, axis = _chk_asarray(a, axis) + am = mask_to_limits(a, (lowerlimit, None), (inclusive, False)) + return ma.minimum.reduce(am, axis) + + +def tmax(a, upperlimit=None, axis=0, inclusive=True): + """ + Compute the trimmed maximum + + This function computes the maximum value of an array along a given axis, + while ignoring values larger than a specified upper limit. + + Parameters + ---------- + a : array_like + array of values + upperlimit : None or float, optional + Values in the input array greater than the given limit will be ignored. + When upperlimit is None, then all values are used. The default value + is None. + axis : None or int, optional + Operate along this axis. None means to use the flattened array and + the default is zero. + inclusive : {True, False}, optional + This flag determines whether values exactly equal to the upper limit + are included. The default value is True. + + Returns + ------- + tmax : float + + """ + a, axis = _chk_asarray(a, axis) + am = mask_to_limits(a, (None, upperlimit), (False, inclusive)) + return ma.maximum.reduce(am, axis) + + +def tstd(a, limits=None, inclusive=(True, True)): + """ + Compute the trimmed sample standard deviation + + This function finds the sample standard deviation of given values, + ignoring values outside the given `limits`. + + Parameters + ---------- + a : array_like + array of values + limits : None or (lower limit, upper limit), optional + Values in the input array less than the lower limit or greater than the + upper limit will be ignored. When limits is None, then all values are + used. Either of the limit values in the tuple can also be None + representing a half-open interval. The default value is None. + inclusive : (bool, bool), optional + A tuple consisting of the (lower flag, upper flag). 
These flags + determine whether values exactly equal to the lower or upper limits + are included. The default value is (True, True). + + Returns + ------- + tstd : float + + Notes + ----- + `tstd` computes the unbiased sample standard deviation, i.e. it uses a + correction factor ``n / (n - 1)``. + + """ + return np.sqrt(tvar(a, limits, inclusive)) + + +def tsem(a, limits=None, inclusive=(True, True)): + """ + Compute the trimmed standard error of the mean. + + This function finds the standard error of the mean for given + values, ignoring values outside the given `limits`. + + Parameters + ---------- + a : array_like + array of values + limits : None or (lower limit, upper limit), optional + Values in the input array less than the lower limit or greater than the + upper limit will be ignored. When limits is None, then all values are + used. Either of the limit values in the tuple can also be None + representing a half-open interval. The default value is None. + inclusive : (bool, bool), optional + A tuple consisting of the (lower flag, upper flag). These flags + determine whether values exactly equal to the lower or upper limits + are included. The default value is (True, True). + + Returns + ------- + tsem : float + + Notes + ----- + `tsem` uses unbiased sample standard deviation, i.e. it uses a + correction factor ``n / (n - 1)``. + + """ + a = np.asarray(a).ravel() + if limits is None: + return a.std(ddof=1) / np.sqrt(a.size) + + am = mask_to_limits(a, limits, inclusive) + sd = np.sqrt(masked_var(am)) + return sd / np.sqrt(am.count()) + + +##################################### +############ MOMENTS ############# +##################################### + +def moment(a, moment=1, axis=0): + """ + Calculates the nth moment about the mean for a sample. + + Generally used to calculate coefficients of skewness and + kurtosis. + + Parameters + ---------- + a : array_like + data + moment : int + order of central moment that is returned + axis : int or None + Axis along which the central moment is computed. If None, then the data + array is raveled. The default axis is zero. + + Returns + ------- + n-th central moment : ndarray or float + The appropriate moment along the given axis or over all values if axis + is None. The denominator for the moment calculation is the number of + observations, no degrees of freedom correction is done. + + """ + a, axis = _chk_asarray(a, axis) + if moment == 1: + # By definition the first moment about the mean is 0. + shape = list(a.shape) + del shape[axis] + if shape: + # return an actual array of the appropriate shape + return np.zeros(shape, dtype=float) + else: + # the input was 1D, so return a scalar instead of a rank-0 array + return np.float64(0.0) + else: + mn = np.expand_dims(np.mean(a,axis), axis) + s = np.power((a-mn), moment) + return np.mean(s, axis) + + +def variation(a, axis=0): + """ + Computes the coefficient of variation, the ratio of the biased standard + deviation to the mean. + + Parameters + ---------- + a : array_like + Input array. + axis : int or None + Axis along which to calculate the coefficient of variation. + + References + ---------- + .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard + Probability and Statistics Tables and Formulae. Chapman & Hall: New + York. 2000. + + """ + a, axis = _chk_asarray(a, axis) + return a.std(axis)/a.mean(axis) + + +def skew(a, axis=0, bias=True): + """ + Computes the skewness of a data set. + + For normally distributed data, the skewness should be about 0. 
A skewness + value > 0 means that there is more weight in the left tail of the + distribution. The function `skewtest` can be used to determine if the + skewness value is close enough to 0, statistically speaking. + + Parameters + ---------- + a : ndarray + data + axis : int or None + axis along which skewness is calculated + bias : bool + If False, then the calculations are corrected for statistical bias. + + Returns + ------- + skewness : ndarray + The skewness of values along an axis, returning 0 where all values are + equal. + + References + ---------- + [CRCProbStat2000]_ Section 2.2.24.1 + + .. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard + Probability and Statistics Tables and Formulae. Chapman & Hall: New + York. 2000. + + """ + a, axis = _chk_asarray(a,axis) + n = a.shape[axis] + m2 = moment(a, 2, axis) + m3 = moment(a, 3, axis) + zero = (m2 == 0) + vals = np.where(zero, 0, m3 / m2**1.5) + if not bias: + can_correct = (n > 2) & (m2 > 0) + if can_correct.any(): + m2 = np.extract(can_correct, m2) + m3 = np.extract(can_correct, m3) + nval = np.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5 + np.place(vals, can_correct, nval) + if vals.ndim == 0: + return vals.item() + return vals + + +def kurtosis(a, axis=0, fisher=True, bias=True): + """ + Computes the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + Use `kurtosistest` to see if result is close enough to normal. + + Parameters + ---------- + a : array + data for which the kurtosis is calculated + axis : int or None + Axis along which the kurtosis is calculated + fisher : bool + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool + If False, then the calculations are corrected for statistical bias. + + Returns + ------- + kurtosis : array + The kurtosis of values along an axis. If all values are equal, + return -3 for Fisher's definition and 0 for Pearson's definition. + + References + ---------- + .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard + Probability and Statistics Tables and Formulae. Chapman & Hall: New + York. 2000. + + """ + a, axis = _chk_asarray(a, axis) + n = a.shape[axis] + m2 = moment(a,2,axis) + m4 = moment(a,4,axis) + zero = (m2 == 0) + olderr = np.seterr(all='ignore') + try: + vals = np.where(zero, 0, m4 / m2**2.0) + finally: + np.seterr(**olderr) + + if not bias: + can_correct = (n > 3) & (m2 > 0) + if can_correct.any(): + m2 = np.extract(can_correct, m2) + m4 = np.extract(can_correct, m4) + nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0) + np.place(vals, can_correct, nval+3.0) + + if vals.ndim == 0: + vals = vals.item() # array scalar + + if fisher: + return vals - 3 + else: + return vals + + +def describe(a, axis=0): + """ + Computes several descriptive statistics of the passed array. + + Parameters + ---------- + a : array_like + data + axis : int or None + axis along which statistics are calculated. If axis is None, then data + array is raveled. The default axis is zero. 
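# Sketch (made-up data) of the moment-based definitions reported by describe
# and computed by skew/kurtosis above: g1 = m3 / m2**1.5 and Fisher (excess)
# kurtosis g2 = m4 / m2**2 - 3, using the biased (1/n) central moments.
import numpy as np

x = np.array([2.0, 8.0, 0.0, 4.0, 1.0, 9.0, 9.0, 0.0])
m = x.mean()
m2, m3, m4 = [np.mean((x - m) ** k) for k in (2, 3, 4)]
g1 = m3 / m2 ** 1.5                             # biased skewness
g2 = m4 / m2 ** 2 - 3.0                         # Fisher (excess) kurtosis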
+ + Returns + ------- + size of the data : int + length of data along axis + (min, max): tuple of ndarrays or floats + minimum and maximum value of data array + arithmetic mean : ndarray or float + mean of data along axis + unbiased variance : ndarray or float + variance of the data along axis, denominator is number of observations + minus one. + biased skewness : ndarray or float + skewness, based on moment calculations with denominator equal to the + number of observations, i.e. no degrees of freedom correction + biased kurtosis : ndarray or float + kurtosis (Fisher), the kurtosis is normalized so that it is zero for the + normal distribution. No degrees of freedom or bias correction is used. + + See Also + -------- + skew + kurtosis + + """ + a, axis = _chk_asarray(a, axis) + n = a.shape[axis] + mm = (np.min(a, axis=axis), np.max(a, axis=axis)) + m = np.mean(a, axis=axis) + v = np.var(a, axis=axis, ddof=1) + sk = skew(a, axis) + kurt = kurtosis(a, axis) + return n, mm, m, v, sk, kurt + +##################################### +######## NORMALITY TESTS ########## +##################################### + + +def skewtest(a, axis=0): + """ + Tests whether the skew is different from the normal distribution. + + This function tests the null hypothesis that the skewness of + the population that the sample was drawn from is the same + as that of a corresponding normal distribution. + + Parameters + ---------- + a : array + axis : int or None + + Returns + ------- + z-score : float + The computed z-score for this test. + p-value : float + a 2-sided p-value for the hypothesis test + + Notes + ----- + The sample size must be at least 8. + + """ + a, axis = _chk_asarray(a, axis) + if axis is None: + a = np.ravel(a) + axis = 0 + b2 = skew(a, axis) + n = float(a.shape[axis]) + if n < 8: + raise ValueError( + "skewtest is not valid with less than 8 samples; %i samples" + " were given." % int(n)) + y = b2 * math.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2))) + beta2 = (3.0 * (n * n + 27 * n - 70) * (n + 1) * (n + 3) / + ((n - 2.0) * (n + 5) * (n + 7) * (n + 9))) + W2 = -1 + math.sqrt(2 * (beta2 - 1)) + delta = 1 / math.sqrt(0.5 * math.log(W2)) + alpha = math.sqrt(2.0 / (W2 - 1)) + y = np.where(y == 0, 1, y) + Z = delta * np.log(y / alpha + np.sqrt((y / alpha) ** 2 + 1)) + return Z, 2 * distributions.norm.sf(np.abs(Z)) + + +def kurtosistest(a, axis=0): + """ + Tests whether a dataset has normal kurtosis + + This function tests the null hypothesis that the kurtosis + of the population from which the sample was drawn is that + of the normal distribution: ``kurtosis = 3(n-1)/(n+1)``. + + Parameters + ---------- + a : array + array of the sample data + axis : int or None + the axis to operate along, or None to work on the whole array. + The default is the first axis. + + Returns + ------- + z-score : float + The computed z-score for this test. + p-value : float + The 2-sided p-value for the hypothesis test + + Notes + ----- + Valid only for n>20. The Z-score is set to 0 for bad entries. + + """ + a, axis = _chk_asarray(a, axis) + n = float(a.shape[axis]) + if n < 5: + raise ValueError( + "kurtosistest requires at least 5 observations; %i observations" + " were given." % int(n)) + if n < 20: + warnings.warn( + "kurtosistest only valid for n>=20 ... 
continuing anyway, n=%i" % + int(n)) + b2 = kurtosis(a, axis, fisher=False) + E = 3.0*(n-1) / (n+1) + varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1)*(n+3)*(n+5)) + x = (b2-E)/np.sqrt(varb2) + sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) / + (n*(n-2)*(n-3))) + A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2))) + term1 = 1 - 2/(9.0*A) + denom = 1 + x*np.sqrt(2/(A-4.0)) + denom = np.where(denom < 0, 99, denom) + term2 = np.where(denom < 0, term1, np.power((1-2.0/A)/denom,1/3.0)) + Z = (term1 - term2) / np.sqrt(2/(9.0*A)) + Z = np.where(denom == 99, 0, Z) + if Z.ndim == 0: + Z = Z[()] + # JPNote: p-value sometimes larger than 1 + # zprob uses upper tail, so Z needs to be positive + return Z, 2 * distributions.norm.sf(np.abs(Z)) + + +def normaltest(a, axis=0): + """ + Tests whether a sample differs from a normal distribution. + + This function tests the null hypothesis that a sample comes + from a normal distribution. It is based on D'Agostino and + Pearson's [1]_, [2]_ test that combines skew and kurtosis to + produce an omnibus test of normality. + + + Parameters + ---------- + a : array_like + The array containing the data to be tested. + axis : int or None + If None, the array is treated as a single data set, regardless of + its shape. Otherwise, each 1-d array along axis `axis` is tested. + + Returns + ------- + k2 : float or array + `s^2 + k^2`, where `s` is the z-score returned by `skewtest` and + `k` is the z-score returned by `kurtosistest`. + p-value : float or array + A 2-sided chi squared probability for the hypothesis test. + + References + ---------- + .. [1] D'Agostino, R. B. (1971), "An omnibus test of normality for + moderate and large sample size," Biometrika, 58, 341-348 + + .. [2] D'Agostino, R. and Pearson, E. S. (1973), "Testing for + departures from normality," Biometrika, 60, 613-622 + + """ + a, axis = _chk_asarray(a, axis) + s,p = skewtest(a,axis) + k,p = kurtosistest(a,axis) + k2 = s*s + k*k + return k2, chisqprob(k2,2) + + +def jarque_bera(x): + """ + Perform the Jarque-Bera goodness of fit test on sample data. + + The Jarque-Bera test tests whether the sample data has the skewness and + kurtosis matching a normal distribution. + + Note that this test only works for a large enough number of data samples + (>2000) as the test statistic asymptotically has a Chi-squared distribution + with 2 degrees of freedom. + + Parameters + ---------- + x : array_like + Observations of a random variable. + + Returns + ------- + jb_value : float + The test statistic. + p : float + The p-value for the hypothesis test. + + References + ---------- + .. [1] Jarque, C. and Bera, A. (1980) "Efficient tests for normality, + homoscedasticity and serial independence of regression residuals", + 6 Econometric Letters 255-259. + + Examples + -------- + >>> from scipy import stats + >>> np.random.seed(987654321) + >>> x = np.random.normal(0, 1, 100000) + >>> y = np.random.rayleigh(1, 100000) + >>> stats.jarque_bera(x) + (4.7165707989581342, 0.09458225503041906) + >>> stats.jarque_bera(y) + (6713.7098548143422, 0.0) + + """ + x = np.asarray(x) + n = float(x.size) + if n == 0: + raise ValueError('At least one observation is required.') + + mu = x.mean() + diffx = x - mu + skewness = (1 / n * np.sum(diffx**3)) / (1 / n * np.sum(diffx**2))**(3 / 2.) 
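    # Biased (1/n) sample moments: 'skewness' above is m3 / m2**1.5, while
    # 'kurtosis' below is the raw (Pearson) m4 / m2**2, not the excess value.
    # The statistic n/6 * (S**2 + (K - 3)**2 / 4) computed next is referred to
    # a chi-squared distribution with 2 degrees of freedom.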
+ kurtosis = (1 / n * np.sum(diffx**4)) / (1 / n * np.sum(diffx**2))**2 + jb_value = n / 6 * (skewness**2 + (kurtosis - 3)**2 / 4) + p = 1 - distributions.chi2.cdf(jb_value, 2) + + return jb_value, p + + +##################################### +###### FREQUENCY FUNCTIONS ####### +##################################### + +def itemfreq(a): + """ + Returns a 2-D array of item frequencies. + + Parameters + ---------- + a : (N,) array_like + Input array. + + Returns + ------- + itemfreq : (K, 2) ndarray + A 2-D frequency table. Column 1 contains sorted, unique values from + `a`, column 2 contains their respective counts. + + Examples + -------- + >>> a = np.array([1, 1, 5, 0, 1, 2, 2, 0, 1, 4]) + >>> stats.itemfreq(a) + array([[ 0., 2.], + [ 1., 4.], + [ 2., 2.], + [ 4., 1.], + [ 5., 1.]]) + >>> np.bincount(a) + array([2, 4, 2, 0, 1, 1]) + + >>> stats.itemfreq(a/10.) + array([[ 0. , 2. ], + [ 0.1, 4. ], + [ 0.2, 2. ], + [ 0.4, 1. ], + [ 0.5, 1. ]]) + + """ + items, inv = np.unique(a, return_inverse=True) + freq = np.bincount(inv) + return np.array([items, freq]).T + + +def scoreatpercentile(a, per, limit=(), interpolation_method='fraction', + axis=None): + """ + Calculate the score at a given percentile of the input sequence. + + For example, the score at `per=50` is the median. If the desired quantile + lies between two data points, we interpolate between them, according to + the value of `interpolation`. If the parameter `limit` is provided, it + should be a tuple (lower, upper) of two values. + + Parameters + ---------- + a : array_like + A 1-D array of values from which to extract score. + per : array_like + Percentile(s) at which to extract score. Values should be in range + [0,100]. + limit : tuple, optional + Tuple of two scalars, the lower and upper limits within which to + compute the percentile. Values of `a` outside + this (closed) interval will be ignored. + interpolation : {'fraction', 'lower', 'higher'}, optional + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j` + + - fraction: ``i + (j - i) * fraction`` where ``fraction`` is the + fractional part of the index surrounded by ``i`` and ``j``. + - lower: ``i``. + - higher: ``j``. + + axis : int, optional + Axis along which the percentiles are computed. The default (None) + is to compute the median along a flattened version of the array. + + Returns + ------- + score : float (or sequence of floats) + Score at percentile. + + See Also + -------- + percentileofscore + + Examples + -------- + >>> from scipy import stats + >>> a = np.arange(100) + >>> stats.scoreatpercentile(a, 50) + 49.5 + + """ + # adapted from NumPy's percentile function + a = np.asarray(a) + + if limit: + a = a[(limit[0] <= a) & (a <= limit[1])] + + if per == 0: + return a.min(axis=axis) + elif per == 100: + return a.max(axis=axis) + + sorted = np.sort(a, axis=axis) + if axis is None: + axis = 0 + + return _compute_qth_percentile(sorted, per, interpolation_method, axis) + + +# handle sequence of per's without calling sort multiple times +def _compute_qth_percentile(sorted, per, interpolation_method, axis): + if not np.isscalar(per): + return [_compute_qth_percentile(sorted, i, interpolation_method, axis) + for i in per] + + if (per < 0) or (per > 100): + raise ValueError("percentile must be in the range [0, 100]") + + indexer = [slice(None)] * sorted.ndim + idx = per / 100. 
* (sorted.shape[axis] - 1) + + if int(idx) != idx: + # round fractional indices according to interpolation method + if interpolation_method == 'lower': + idx = int(np.floor(idx)) + elif interpolation_method == 'higher': + idx = int(np.ceil(idx)) + elif interpolation_method == 'fraction': + pass # keep idx as fraction and interpolate + else: + raise ValueError("interpolation_method can only be 'fraction', " + "'lower' or 'higher'") + + i = int(idx) + if i == idx: + indexer[axis] = slice(i, i + 1) + weights = array(1) + sumval = 1.0 + else: + indexer[axis] = slice(i, i + 2) + j = i + 1 + weights = array([(j - idx), (idx - i)], float) + wshape = [1] * sorted.ndim + wshape[axis] = 2 + weights.shape = wshape + sumval = weights.sum() + + # Use np.add.reduce to coerce data type + return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval + + +def percentileofscore(a, score, kind='rank'): + """ + The percentile rank of a score relative to a list of scores. + + A `percentileofscore` of, for example, 80% means that 80% of the + scores in `a` are below the given score. In the case of gaps or + ties, the exact definition depends on the optional keyword, `kind`. + + Parameters + ---------- + a : array_like + Array of scores to which `score` is compared. + score : int or float + Score that is compared to the elements in `a`. + kind : {'rank', 'weak', 'strict', 'mean'}, optional + This optional parameter specifies the interpretation of the + resulting score: + + - "rank": Average percentage ranking of score. In case of + multiple matches, average the percentage rankings of + all matching scores. + - "weak": This kind corresponds to the definition of a cumulative + distribution function. A percentileofscore of 80% + means that 80% of values are less than or equal + to the provided score. + - "strict": Similar to "weak", except that only values that are + strictly less than the given score are counted. + - "mean": The average of the "weak" and "strict" scores, often used in + testing. See + + http://en.wikipedia.org/wiki/Percentile_rank + + Returns + ------- + pcos : float + Percentile-position of score (0-100) relative to `a`. + + Examples + -------- + Three-quarters of the given values lie below a given score: + + >>> percentileofscore([1, 2, 3, 4], 3) + 75.0 + + With multiple matches, note how the scores of the two matches, 0.6 + and 0.8 respectively, are averaged: + + >>> percentileofscore([1, 2, 3, 3, 4], 3) + 70.0 + + Only 2/5 values are strictly less than 3: + + >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='strict') + 40.0 + + But 4/5 values are less than or equal to 3: + + >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='weak') + 80.0 + + The average between the weak and the strict scores is + + >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='mean') + 60.0 + + """ + a = np.array(a) + n = len(a) + + if kind == 'rank': + if not(np.any(a == score)): + a = np.append(a, score) + a_len = np.array(list(range(len(a)))) + else: + a_len = np.array(list(range(len(a)))) + 1.0 + + a = np.sort(a) + idx = [a == score] + pct = (np.mean(a_len[idx]) / n) * 100.0 + return pct + + elif kind == 'strict': + return sum(a < score) / float(n) * 100 + elif kind == 'weak': + return sum(a <= score) / float(n) * 100 + elif kind == 'mean': + return (sum(a < score) + sum(a <= score)) * 50 / float(n) + else: + raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'") + + +def histogram2(a, bins): + """ + Compute histogram using divisions in bins. 
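# Compact sketch (made-up data) of this binning scheme, described in detail
# just below: each count covers [bins[i], bins[i+1]), the last bin is
# open-ended, and values below bins[0] are dropped.
import numpy as np

a = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.5, 7.0])
bins = np.array([1.0, 2.0, 3.0])
edges = np.searchsorted(np.sort(a), bins)       # insertion point of each edge
counts = np.diff(np.r_[edges, a.size])          # -> array([2, 2, 2])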
+ + Count the number of times values from array `a` fall into + numerical ranges defined by `bins`. Range x is given by + bins[x] <= range_x < bins[x+1] where x =0,N and N is the + length of the `bins` array. The last range is given by + bins[N] <= range_N < infinity. Values less than bins[0] are + not included in the histogram. + + Parameters + ---------- + a : array_like of rank 1 + The array of values to be assigned into bins + bins : array_like of rank 1 + Defines the ranges of values to use during histogramming. + + Returns + ------- + histogram2 : ndarray of rank 1 + Each value represents the occurrences for a given bin (range) of + values. + + """ + # comment: probably obsoleted by numpy.histogram() + n = np.searchsorted(np.sort(a), bins) + n = np.concatenate([n, [len(a)]]) + return n[1:]-n[:-1] + + +def histogram(a, numbins=10, defaultlimits=None, weights=None, printextras=False): + """ + Separates the range into several bins and returns the number of instances + in each bin. + + Parameters + ---------- + a : array_like + Array of scores which will be put into bins. + numbins : int, optional + The number of bins to use for the histogram. Default is 10. + defaultlimits : tuple (lower, upper), optional + The lower and upper values for the range of the histogram. + If no value is given, a range slightly larger then the range of the + values in a is used. Specifically ``(a.min() - s, a.max() + s)``, + where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. + weights : array_like, optional + The weights for each value in `a`. Default is None, which gives each + value a weight of 1.0 + printextras : bool, optional + If True, if there are extra points (i.e. the points that fall outside + the bin limits) a warning is raised saying how many of those points + there are. Default is False. + + Returns + ------- + histogram : ndarray + Number of points (or sum of weights) in each bin. + low_range : float + Lowest value of histogram, the lower limit of the first bin. + binsize : float + The size of the bins (all bins have the same size). + extrapoints : int + The number of points outside the range of the histogram. + + See Also + -------- + numpy.histogram + + Notes + ----- + This histogram is based on numpy's histogram but has a larger range by + default if default limits is not set. + + """ + a = np.ravel(a) + if defaultlimits is None: + # no range given, so use values in `a` + data_min = a.min() + data_max = a.max() + # Have bins extend past min and max values slightly + s = (data_max - data_min) / (2. * (numbins - 1.)) + defaultlimits = (data_min - s, data_max + s) + # use numpy's histogram method to compute bins + hist, bin_edges = np.histogram(a, bins=numbins, range=defaultlimits, + weights=weights) + # hist are not always floats, convert to keep with old output + hist = np.array(hist, dtype=float) + # fixed width for bins is assumed, as numpy's histogram gives + # fixed width bins for int values for 'bins' + binsize = bin_edges[1] - bin_edges[0] + # calculate number of extra points + extrapoints = len([v for v in a + if defaultlimits[0] > v or v > defaultlimits[1]]) + if extrapoints > 0 and printextras: + warnings.warn("Points outside given histogram range = %s" + % extrapoints) + return (hist, defaultlimits[0], binsize, extrapoints) + + +def cumfreq(a, numbins=10, defaultreallimits=None, weights=None): + """ + Returns a cumulative frequency histogram, using the histogram function. + + Parameters + ---------- + a : array_like + Input array. 
+ numbins : int, optional + The number of bins to use for the histogram. Default is 10. + defaultlimits : tuple (lower, upper), optional + The lower and upper values for the range of the histogram. + If no value is given, a range slightly larger than the range of the + values in `a` is used. Specifically ``(a.min() - s, a.max() + s)``, + where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. + weights : array_like, optional + The weights for each value in `a`. Default is None, which gives each + value a weight of 1.0 + + Returns + ------- + cumfreq : ndarray + Binned values of cumulative frequency. + lowerreallimit : float + Lower real limit + binsize : float + Width of each bin. + extrapoints : int + Extra points. + + Examples + -------- + >>> x = [1, 4, 2, 1, 3, 1] + >>> cumfreqs, lowlim, binsize, extrapoints = sp.stats.cumfreq(x, numbins=4) + >>> cumfreqs + array([ 3., 4., 5., 6.]) + >>> cumfreqs, lowlim, binsize, extrapoints = \ + ... sp.stats.cumfreq(x, numbins=4, defaultreallimits=(1.5, 5)) + >>> cumfreqs + array([ 1., 2., 3., 3.]) + >>> extrapoints + 3 + + """ + h,l,b,e = histogram(a, numbins, defaultreallimits, weights=weights) + cumhist = np.cumsum(h*1, axis=0) + return cumhist,l,b,e + + +def relfreq(a, numbins=10, defaultreallimits=None, weights=None): + """ + Returns a relative frequency histogram, using the histogram function. + + Parameters + ---------- + a : array_like + Input array. + numbins : int, optional + The number of bins to use for the histogram. Default is 10. + defaultreallimits : tuple (lower, upper), optional + The lower and upper values for the range of the histogram. + If no value is given, a range slightly larger then the range of the + values in a is used. Specifically ``(a.min() - s, a.max() + s)``, + where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. + weights : array_like, optional + The weights for each value in `a`. Default is None, which gives each + value a weight of 1.0 + + Returns + ------- + relfreq : ndarray + Binned values of relative frequency. + lowerreallimit : float + Lower real limit + binsize : float + Width of each bin. + extrapoints : int + Extra points. + + Examples + -------- + >>> a = np.array([1, 4, 2, 1, 3, 1]) + >>> relfreqs, lowlim, binsize, extrapoints = sp.stats.relfreq(a, numbins=4) + >>> relfreqs + array([ 0.5 , 0.16666667, 0.16666667, 0.16666667]) + >>> np.sum(relfreqs) # relative frequencies should add up to 1 + 0.99999999999999989 + + """ + h, l, b, e = histogram(a, numbins, defaultreallimits, weights=weights) + h = np.array(h / float(np.array(a).shape[0])) + return h, l, b, e + + +##################################### +###### VARIABILITY FUNCTIONS ##### +##################################### + +def obrientransform(*args): + """ + Computes the O'Brien transform on input data (any number of arrays). + + Used to test for homogeneity of variance prior to running one-way stats. + Each array in ``*args`` is one level of a factor. + If `f_oneway` is run on the transformed data and found significant, + the variances are unequal. From Maxwell and Delaney [1]_, p.112. + + Parameters + ---------- + args : tuple of array_like + Any number of arrays. + + Returns + ------- + obrientransform : ndarray + Transformed data for use in an ANOVA. The first dimension + of the result corresponds to the sequence of transformed + arrays. If the arrays given are all 1-D of the same length, + the return value is a 2-D array; otherwise it is a 1-D array + of type object, with each element being an ndarray. + + References + ---------- + .. [1] S. 
E. Maxwell and H. D. Delaney, "Designing Experiments and + Analyzing Data: A Model Comparison Perspective", Wadsworth, 1990. + + Examples + -------- + We'll test the following data sets for differences in their variance. + + >>> x = [10, 11, 13, 9, 7, 12, 12, 9, 10] + >>> y = [13, 21, 5, 10, 8, 14, 10, 12, 7, 15] + + Apply the O'Brien transform to the data. + + >>> tx, ty = obrientransform(x, y) + + Use `scipy.stats.f_oneway` to apply a one-way ANOVA test to the + transformed data. + + >>> from scipy.stats import f_oneway + >>> F, p = f_oneway(tx, ty) + >>> p + 0.1314139477040335 + + If we require that ``p < 0.05`` for significance, we cannot conclude + that the variances are different. + """ + TINY = np.sqrt(np.finfo(float).eps) + + # `arrays` will hold the transformed arguments. + arrays = [] + + for arg in args: + a = np.asarray(arg) + n = len(a) + mu = np.mean(a) + sq = (a - mu)**2 + sumsq = sq.sum() + + # The O'Brien transform. + t = ((n - 1.5) * n * sq - 0.5 * sumsq) / ((n - 1) * (n - 2)) + + # Check that the mean of the transformed data is equal to the + # original variance. + var = sumsq / (n - 1) + if abs(var - np.mean(t)) > TINY: + raise ValueError('Lack of convergence in obrientransform.') + + arrays.append(t) + + # If the arrays are not all the same shape, calling np.array(arrays) + # creates a 1-D array with dtype `object` in numpy 1.6+. In numpy + # 1.5.x, it raises an exception. To work around this, we explicitly + # set the dtype to `object` when the arrays are not all the same shape. + if len(arrays) < 2 or all(x.shape == arrays[0].shape for x in arrays[1:]): + dt = None + else: + dt = object + return np.array(arrays, dtype=dt) + + +def signaltonoise(a, axis=0, ddof=0): + """ + The signal-to-noise ratio of the input data. + + Returns the signal-to-noise ratio of `a`, here defined as the mean + divided by the standard deviation. + + Parameters + ---------- + a : array_like + An array_like object containing the sample data. + axis : int or None, optional + If axis is equal to None, the array is first ravel'd. If axis is an + integer, this is the axis over which to operate. Default is 0. + ddof : int, optional + Degrees of freedom correction for standard deviation. Default is 0. + + Returns + ------- + s2n : ndarray + The mean to standard deviation ratio(s) along `axis`, or 0 where the + standard deviation is 0. + + """ + a = np.asanyarray(a) + m = a.mean(axis) + sd = a.std(axis=axis, ddof=ddof) + return np.where(sd == 0, 0, m/sd) + + +def sem(a, axis=0, ddof=1): + """ + Calculates the standard error of the mean (or standard error of + measurement) of the values in the input array. + + Parameters + ---------- + a : array_like + An array containing the values for which the standard error is + returned. + axis : int or None, optional. + If axis is None, ravel `a` first. If axis is an integer, this will be + the axis over which to operate. Defaults to 0. + ddof : int, optional + Delta degrees-of-freedom. How many degrees of freedom to adjust + for bias in limited samples relative to the population estimate + of variance. Defaults to 1. + + Returns + ------- + s : ndarray or float + The standard error of the mean in the sample(s), along the input axis. + + Notes + ----- + The default value for `ddof` is different to the default (0) used by other + ddof containing routines, such as np.std nd stats.nanstd. 
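# Sketch (made-up data) of the computation: the sample standard deviation with
# the requested ddof, divided by sqrt(n); note that ddof defaults to 1 here,
# unlike np.std.
import numpy as np

a = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
sem_val = a.std(ddof=1) / np.sqrt(a.size)       # == sqrt(0.5), about 0.707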
+ + Examples + -------- + Find standard error along the first axis: + + >>> from scipy import stats + >>> a = np.arange(20).reshape(5,4) + >>> stats.sem(a) + array([ 2.8284, 2.8284, 2.8284, 2.8284]) + + Find standard error across the whole array, using n degrees of freedom: + + >>> stats.sem(a, axis=None, ddof=0) + 1.2893796958227628 + + """ + a, axis = _chk_asarray(a, axis) + n = a.shape[axis] + s = np.std(a,axis=axis, ddof=ddof) / np.sqrt(n) # JP check normalization + return s + + +def zscore(a, axis=0, ddof=0): + """ + Calculates the z score of each value in the sample, relative to the sample + mean and standard deviation. + + Parameters + ---------- + a : array_like + An array like object containing the sample data. + axis : int or None, optional + If `axis` is equal to None, the array is first raveled. If `axis` is + an integer, this is the axis over which to operate. Default is 0. + ddof : int, optional + Degrees of freedom correction in the calculation of the + standard deviation. Default is 0. + + Returns + ------- + zscore : array_like + The z-scores, standardized by mean and standard deviation of input + array `a`. + + Notes + ----- + This function preserves ndarray subclasses, and works also with + matrices and masked arrays (it uses `asanyarray` instead of `asarray` + for parameters). + + Examples + -------- + >>> a = np.array([ 0.7972, 0.0767, 0.4383, 0.7866, 0.8091, 0.1954, + 0.6307, 0.6599, 0.1065, 0.0508]) + >>> from scipy import stats + >>> stats.zscore(a) + array([ 1.1273, -1.247 , -0.0552, 1.0923, 1.1664, -0.8559, 0.5786, + 0.6748, -1.1488, -1.3324]) + + Computing along a specified axis, using n-1 degrees of freedom (``ddof=1``) + to calculate the standard deviation: + + >>> b = np.array([[ 0.3148, 0.0478, 0.6243, 0.4608], + [ 0.7149, 0.0775, 0.6072, 0.9656], + [ 0.6341, 0.1403, 0.9759, 0.4064], + [ 0.5918, 0.6948, 0.904 , 0.3721], + [ 0.0921, 0.2481, 0.1188, 0.1366]]) + >>> stats.zscore(b, axis=1, ddof=1) + array([[-0.19264823, -1.28415119, 1.07259584, 0.40420358], + [ 0.33048416, -1.37380874, 0.04251374, 1.00081084], + [ 0.26796377, -1.12598418, 1.23283094, -0.37481053], + [-0.22095197, 0.24468594, 1.19042819, -1.21416216], + [-0.82780366, 1.4457416 , -0.43867764, -0.1792603 ]]) + """ + a = np.asanyarray(a) + mns = a.mean(axis=axis) + sstd = a.std(axis=axis, ddof=ddof) + if axis and mns.ndim < a.ndim: + return ((a - np.expand_dims(mns, axis=axis)) / + np.expand_dims(sstd,axis=axis)) + else: + return (a - mns) / sstd + + +def zmap(scores, compare, axis=0, ddof=0): + """ + Calculates the relative z-scores. + + Returns an array of z-scores, i.e., scores that are standardized to zero + mean and unit variance, where mean and variance are calculated from the + comparison array. + + Parameters + ---------- + scores : array_like + The input for which z-scores are calculated. + compare : array_like + The input from which the mean and standard deviation of the + normalization are taken; assumed to have the same dimension as + `scores`. + axis : int or None, optional + Axis over which mean and variance of `compare` are calculated. + Default is 0. + ddof : int, optional + Degrees of freedom correction in the calculation of the + standard deviation. Default is 0. + + Returns + ------- + zscore : array_like + Z-scores, in the same shape as `scores`. + + Notes + ----- + This function preserves ndarray subclasses, and works also with + matrices and masked arrays (it uses `asanyarray` instead of `asarray` + for parameters). 
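+
+    For 1-D inputs with the default ``axis=0`` this reduces to
+    ``(scores - compare.mean()) / compare.std(ddof=ddof)``; a small sketch:
+
+    >>> scores = np.array([1.0, 2.0, 3.0])
+    >>> compare = np.array([0.0, 1.0, 2.0, 3.0])
+    >>> np.allclose(zmap(scores, compare),
+    ...             (scores - compare.mean()) / compare.std())
+    True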
+ + Examples + -------- + >>> a = [0.5, 2.0, 2.5, 3] + >>> b = [0, 1, 2, 3, 4] + >>> zmap(a, b) + array([-1.06066017, 0. , 0.35355339, 0.70710678]) + """ + scores, compare = map(np.asanyarray, [scores, compare]) + mns = compare.mean(axis=axis) + sstd = compare.std(axis=axis, ddof=ddof) + if axis and mns.ndim < compare.ndim: + return ((scores - np.expand_dims(mns, axis=axis)) / + np.expand_dims(sstd,axis=axis)) + else: + return (scores - mns) / sstd + + +##################################### +####### TRIMMING FUNCTIONS ####### +##################################### + +def threshold(a, threshmin=None, threshmax=None, newval=0): + """ + Clip array to a given value. + + Similar to numpy.clip(), except that values less than `threshmin` or + greater than `threshmax` are replaced by `newval`, instead of by + `threshmin` and `threshmax` respectively. + + Parameters + ---------- + a : array_like + Data to threshold. + threshmin : float, int or None, optional + Minimum threshold, defaults to None. + threshmax : float, int or None, optional + Maximum threshold, defaults to None. + newval : float or int, optional + Value to put in place of values in `a` outside of bounds. + Defaults to 0. + + Returns + ------- + out : ndarray + The clipped input array, with values less than `threshmin` or + greater than `threshmax` replaced with `newval`. + + Examples + -------- + >>> a = np.array([9, 9, 6, 3, 1, 6, 1, 0, 0, 8]) + >>> from scipy import stats + >>> stats.threshold(a, threshmin=2, threshmax=8, newval=-1) + array([-1, -1, 6, 3, -1, 6, -1, -1, -1, 8]) + + """ + a = asarray(a).copy() + mask = zeros(a.shape, dtype=bool) + if threshmin is not None: + mask |= (a < threshmin) + if threshmax is not None: + mask |= (a > threshmax) + a[mask] = newval + return a + + +def sigmaclip(a, low=4., high=4.): + """ + Iterative sigma-clipping of array elements. + + The output array contains only those elements of the input array `c` + that satisfy the conditions :: + + mean(c) - std(c)*low < c < mean(c) + std(c)*high + + Starting from the full sample, all elements outside the critical range are + removed. The iteration continues with a new critical range until no + elements are outside the range. + + Parameters + ---------- + a : array_like + Data array, will be raveled if not 1-D. + low : float, optional + Lower bound factor of sigma clipping. Default is 4. + high : float, optional + Upper bound factor of sigma clipping. Default is 4. + + Returns + ------- + c : ndarray + Input array with clipped elements removed. + critlower : float + Lower threshold value use for clipping. + critlupper : float + Upper threshold value use for clipping. + + Examples + -------- + >>> a = np.concatenate((np.linspace(9.5,10.5,31), np.linspace(0,20,5))) + >>> fact = 1.5 + >>> c, low, upp = sigmaclip(a, fact, fact) + >>> c + array([ 9.96666667, 10. , 10.03333333, 10. 
]) + >>> c.var(), c.std() + (0.00055555555555555165, 0.023570226039551501) + >>> low, c.mean() - fact*c.std(), c.min() + (9.9646446609406727, 9.9646446609406727, 9.9666666666666668) + >>> upp, c.mean() + fact*c.std(), c.max() + (10.035355339059327, 10.035355339059327, 10.033333333333333) + + >>> a = np.concatenate((np.linspace(9.5,10.5,11), + np.linspace(-100,-50,3))) + >>> c, low, upp = sigmaclip(a, 1.8, 1.8) + >>> (c == np.linspace(9.5,10.5,11)).all() + True + + """ + c = np.asarray(a).ravel() + delta = 1 + while delta: + c_std = c.std() + c_mean = c.mean() + size = c.size + critlower = c_mean - c_std*low + critupper = c_mean + c_std*high + c = c[(c > critlower) & (c < critupper)] + delta = size-c.size + return c, critlower, critupper + + +def trimboth(a, proportiontocut, axis=0): + """ + Slices off a proportion of items from both ends of an array. + + Slices off the passed proportion of items from both ends of the passed + array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and** + rightmost 10% of scores). You must pre-sort the array if you want + 'proper' trimming. Slices off less if proportion results in a + non-integer slice index (i.e., conservatively slices off + `proportiontocut`). + + Parameters + ---------- + a : array_like + Data to trim. + proportiontocut : float + Proportion (in range 0-1) of total data set to trim of each end. + axis : int or None, optional + Axis along which the observations are trimmed. The default is to trim + along axis=0. If axis is None then the array will be flattened before + trimming. + + Returns + ------- + out : ndarray + Trimmed version of array `a`. + + See Also + -------- + trim_mean + + Examples + -------- + >>> from scipy import stats + >>> a = np.arange(20) + >>> b = stats.trimboth(a, 0.1) + >>> b.shape + (16,) + + """ + a = np.asarray(a) + if axis is None: + a = a.ravel() + axis = 0 + + nobs = a.shape[axis] + lowercut = int(proportiontocut * nobs) + uppercut = nobs - lowercut + if (lowercut >= uppercut): + raise ValueError("Proportion too big.") + + sl = [slice(None)] * a.ndim + sl[axis] = slice(lowercut, uppercut) + return a[sl] + + +def trim1(a, proportiontocut, tail='right'): + """ + Slices off a proportion of items from ONE end of the passed array + distribution. + + If `proportiontocut` = 0.1, slices off 'leftmost' or 'rightmost' + 10% of scores. Slices off LESS if proportion results in a non-integer + slice index (i.e., conservatively slices off `proportiontocut` ). + + Parameters + ---------- + a : array_like + Input array + proportiontocut : float + Fraction to cut off of 'left' or 'right' of distribution + tail : {'left', 'right'}, optional + Defaults to 'right'. + + Returns + ------- + trim1 : ndarray + Trimmed version of array `a` + + """ + a = asarray(a) + if tail.lower() == 'right': + lowercut = 0 + uppercut = len(a) - int(proportiontocut*len(a)) + elif tail.lower() == 'left': + lowercut = int(proportiontocut*len(a)) + uppercut = len(a) + + return a[lowercut:uppercut] + + +def trim_mean(a, proportiontocut, axis=0): + """ + Return mean of array after trimming distribution from both lower and upper + tails. + + If `proportiontocut` = 0.1, slices off 'leftmost' and 'rightmost' 10% of + scores. Slices off LESS if proportion results in a non-integer slice + index (i.e., conservatively slices off `proportiontocut` ). 
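+
+    For instance, with ``proportiontocut=0.1`` and 15 observations,
+    ``int(0.1 * 15) == 1`` observation is removed from each tail, so the mean
+    is taken over the remaining 13 values (an illustration of the rule above,
+    not an additional option).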
+ + Parameters + ---------- + a : array_like + Input array + proportiontocut : float + Fraction to cut off of both tails of the distribution + axis : int or None, optional + Axis along which the trimmed means are computed. The default is axis=0. + If axis is None then the trimmed mean will be computed for the + flattened array. + + Returns + ------- + trim_mean : ndarray + Mean of trimmed array. + + See Also + -------- + trimboth + + Examples + -------- + >>> from scipy import stats + >>> x = np.arange(20) + >>> stats.trim_mean(x, 0.1) + 9.5 + >>> x2 = x.reshape(5, 4) + >>> x2 + array([[ 0, 1, 2, 3], + [ 4, 5, 6, 7], + [ 8, 9, 10, 11], + [12, 13, 14, 15], + [16, 17, 18, 19]]) + >>> stats.trim_mean(x2, 0.25) + array([ 8., 9., 10., 11.]) + >>> stats.trim_mean(x2, 0.25, axis=1) + array([ 1.5, 5.5, 9.5, 13.5, 17.5]) + + """ + a = np.asarray(a) + if axis is None: + nobs = a.size + else: + nobs = a.shape[axis] + lowercut = int(proportiontocut * nobs) + uppercut = nobs - lowercut - 1 + if (lowercut > uppercut): + raise ValueError("Proportion too big.") + + try: + atmp = np.partition(a, (lowercut, uppercut), axis) + except AttributeError: + atmp = np.sort(a, axis) + + newa = trimboth(atmp, proportiontocut, axis=axis) + return np.mean(newa, axis=axis) + + +def f_oneway(*args): + """ + Performs a 1-way ANOVA. + + The one-way ANOVA tests the null hypothesis that two or more groups have + the same population mean. The test is applied to samples from two or + more groups, possibly with differing sizes. + + Parameters + ---------- + sample1, sample2, ... : array_like + The sample measurements for each group. + + Returns + ------- + F-value : float + The computed F-value of the test. + p-value : float + The associated p-value from the F-distribution. + + Notes + ----- + The ANOVA test has important assumptions that must be satisfied in order + for the associated p-value to be valid. + + 1. The samples are independent. + 2. Each sample is from a normally distributed population. + 3. The population standard deviations of the groups are all equal. This + property is known as homoscedasticity. + + If these assumptions are not true for a given set of data, it may still be + possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`) although + with some loss of power. + + The algorithm is from Heiman[2], pp.394-7. + + + References + ---------- + .. [1] Lowry, Richard. "Concepts and Applications of Inferential + Statistics". Chapter 14. + http://faculty.vassar.edu/lowry/ch14pt1.html + + .. [2] Heiman, G.W. Research Methods in Statistics. 2002. + + """ + args = list(map(np.asarray, args)) # convert to an numpy array + na = len(args) # ANOVA on 'na' groups, each in it's own array + alldata = np.concatenate(args) + bign = len(alldata) + sstot = ss(alldata) - (square_of_sums(alldata) / float(bign)) + ssbn = 0 + for a in args: + ssbn += square_of_sums(a) / float(len(a)) + ssbn -= (square_of_sums(alldata) / float(bign)) + sswn = sstot - ssbn + dfbn = na - 1 + dfwn = bign - na + msb = ssbn / float(dfbn) + msw = sswn / float(dfwn) + f = msb / msw + prob = fprob(dfbn, dfwn, f) + return f, prob + + +def pearsonr(x, y): + """ + Calculates a Pearson correlation coefficient and the p-value for testing + non-correlation. + + The Pearson correlation coefficient measures the linear relationship + between two datasets. Strictly speaking, Pearson's correlation requires + that each dataset be normally distributed. Like other correlation + coefficients, this one varies between -1 and +1 with 0 implying no + correlation. 
Correlations of -1 or +1 imply an exact linear + relationship. Positive correlations imply that as x increases, so does + y. Negative correlations imply that as x increases, y decreases. + + The p-value roughly indicates the probability of an uncorrelated system + producing datasets that have a Pearson correlation at least as extreme + as the one computed from these datasets. The p-values are not entirely + reliable but are probably reasonable for datasets larger than 500 or so. + + Parameters + ---------- + x : (N,) array_like + Input + y : (N,) array_like + Input + + Returns + ------- + (Pearson's correlation coefficient, + 2-tailed p-value) + + References + ---------- + http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation + + """ + # x and y should have same length. + x = np.asarray(x) + y = np.asarray(y) + n = len(x) + mx = x.mean() + my = y.mean() + xm, ym = x-mx, y-my + r_num = np.add.reduce(xm * ym) + r_den = np.sqrt(ss(xm) * ss(ym)) + r = r_num / r_den + + # Presumably, if abs(r) > 1, then it is only some small artifact of floating + # point arithmetic. + r = max(min(r, 1.0), -1.0) + df = n-2 + if abs(r) == 1.0: + prob = 0.0 + else: + t_squared = r*r * (df / ((1.0 - r) * (1.0 + r))) + prob = betai(0.5*df, 0.5, df / (df + t_squared)) + return r, prob + + +def fisher_exact(table, alternative='two-sided'): + """Performs a Fisher exact test on a 2x2 contingency table. + + Parameters + ---------- + table : array_like of ints + A 2x2 contingency table. Elements should be non-negative integers. + alternative : {'two-sided', 'less', 'greater'}, optional + Which alternative hypothesis to the null hypothesis the test uses. + Default is 'two-sided'. + + Returns + ------- + oddsratio : float + This is prior odds ratio and not a posterior estimate. + p_value : float + P-value, the probability of obtaining a distribution at least as + extreme as the one that was actually observed, assuming that the + null hypothesis is true. + + See Also + -------- + chi2_contingency : Chi-square test of independence of variables in a + contingency table. + + Notes + ----- + The calculated odds ratio is different from the one R uses. In R language, + this implementation returns the (more common) "unconditional Maximum + Likelihood Estimate", while R uses the "conditional Maximum Likelihood + Estimate". + + For tables with large numbers the (inexact) chi-square test implemented + in the function `chi2_contingency` can also be used. + + Examples + -------- + Say we spend a few days counting whales and sharks in the Atlantic and + Indian oceans. In the Atlantic ocean we find 8 whales and 1 shark, in the + Indian ocean 2 whales and 5 sharks. Then our contingency table is:: + + Atlantic Indian + whales 8 2 + sharks 1 5 + + We use this table to find the p-value: + + >>> oddsratio, pvalue = stats.fisher_exact([[8, 2], [1, 5]]) + >>> pvalue + 0.0349... + + The probability that we would observe this or an even more imbalanced ratio + by chance is about 3.5%. A commonly used significance level is 5%, if we + adopt that we can therefore conclude that our observed imbalance is + statistically significant; whales prefer the Atlantic while sharks prefer + the Indian ocean. 
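+
+    The one-sided alternatives use the same table; for example (the resulting
+    p-value is not reproduced here, since it depends on the hypergeometric
+    tail probabilities):
+
+    >>> oddsratio, pvalue = stats.fisher_exact([[8, 2], [1, 5]],
+    ...                                        alternative='greater')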
+ + """ + hypergeom = distributions.hypergeom + c = np.asarray(table, dtype=np.int64) # int32 is not enough for the algorithm + if not c.shape == (2, 2): + raise ValueError("The input `table` must be of shape (2, 2).") + + if np.any(c < 0): + raise ValueError("All values in `table` must be nonnegative.") + + if 0 in c.sum(axis=0) or 0 in c.sum(axis=1): + # If both values in a row or column are zero, the p-value is 1 and + # the odds ratio is NaN. + return np.nan, 1.0 + + if c[1,0] > 0 and c[0,1] > 0: + oddsratio = c[0,0] * c[1,1] / float(c[1,0] * c[0,1]) + else: + oddsratio = np.inf + + n1 = c[0,0] + c[0,1] + n2 = c[1,0] + c[1,1] + n = c[0,0] + c[1,0] + + def binary_search(n, n1, n2, side): + """Binary search for where to begin lower/upper halves in two-sided + test. + """ + if side == "upper": + minval = mode + maxval = n + else: + minval = 0 + maxval = mode + guess = -1 + while maxval - minval > 1: + if maxval == minval + 1 and guess == minval: + guess = maxval + else: + guess = (maxval + minval) // 2 + pguess = hypergeom.pmf(guess, n1 + n2, n1, n) + if side == "upper": + ng = guess - 1 + else: + ng = guess + 1 + if pguess <= pexact and hypergeom.pmf(ng, n1 + n2, n1, n) > pexact: + break + elif pguess < pexact: + maxval = guess + else: + minval = guess + if guess == -1: + guess = minval + if side == "upper": + while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon: + guess -= 1 + while hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon: + guess += 1 + else: + while hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon: + guess += 1 + while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon: + guess -= 1 + return guess + + if alternative == 'less': + pvalue = hypergeom.cdf(c[0,0], n1 + n2, n1, n) + elif alternative == 'greater': + # Same formula as the 'less' case, but with the second column. + pvalue = hypergeom.cdf(c[0,1], n1 + n2, n1, c[0,1] + c[1,1]) + elif alternative == 'two-sided': + mode = int(float((n + 1) * (n1 + 1)) / (n1 + n2 + 2)) + pexact = hypergeom.pmf(c[0,0], n1 + n2, n1, n) + pmode = hypergeom.pmf(mode, n1 + n2, n1, n) + + epsilon = 1 - 1e-4 + if float(np.abs(pexact - pmode)) / np.abs(np.max(pexact, pmode)) <= 1 - epsilon: + return oddsratio, 1. + + elif c[0,0] < mode: + plower = hypergeom.cdf(c[0,0], n1 + n2, n1, n) + if hypergeom.pmf(n, n1 + n2, n1, n) > pexact / epsilon: + return oddsratio, plower + + guess = binary_search(n, n1, n2, "upper") + pvalue = plower + hypergeom.sf(guess - 1, n1 + n2, n1, n) + else: + pupper = hypergeom.sf(c[0,0] - 1, n1 + n2, n1, n) + if hypergeom.pmf(0, n1 + n2, n1, n) > pexact / epsilon: + return oddsratio, pupper + + guess = binary_search(n, n1, n2, "lower") + pvalue = pupper + hypergeom.cdf(guess, n1 + n2, n1, n) + else: + msg = "`alternative` should be one of {'two-sided', 'less', 'greater'}" + raise ValueError(msg) + + if pvalue > 1.0: + pvalue = 1.0 + return oddsratio, pvalue + + +def spearmanr(a, b=None, axis=0): + """ + Calculates a Spearman rank-order correlation coefficient and the p-value + to test for non-correlation. + + The Spearman correlation is a nonparametric measure of the monotonicity + of the relationship between two datasets. Unlike the Pearson correlation, + the Spearman correlation does not assume that both datasets are normally + distributed. Like other correlation coefficients, this one varies + between -1 and +1 with 0 implying no correlation. Correlations of -1 or + +1 imply an exact monotonic relationship. 
Positive correlations imply that + as x increases, so does y. Negative correlations imply that as x + increases, y decreases. + + The p-value roughly indicates the probability of an uncorrelated system + producing datasets that have a Spearman correlation at least as extreme + as the one computed from these datasets. The p-values are not entirely + reliable but are probably reasonable for datasets larger than 500 or so. + + Parameters + ---------- + a, b : 1D or 2D array_like, b is optional + One or two 1-D or 2-D arrays containing multiple variables and + observations. Each column of `a` and `b` represents a variable, and + each row entry a single observation of those variables. See also + `axis`. Both arrays need to have the same length in the `axis` + dimension. + axis : int or None, optional + If axis=0 (default), then each column represents a variable, with + observations in the rows. If axis=0, the relationship is transposed: + each row represents a variable, while the columns contain observations. + If axis=None, then both arrays will be raveled. + + Returns + ------- + rho : float or ndarray (2-D square) + Spearman correlation matrix or correlation coefficient (if only 2 + variables are given as parameters. Correlation matrix is square with + length equal to total number of variables (columns or rows) in a and b + combined. + p-value : float + The two-sided p-value for a hypothesis test whose null hypothesis is + that two sets of data are uncorrelated, has same dimension as rho. + + Notes + ----- + Changes in scipy 0.8.0: rewrite to add tie-handling, and axis. + + References + ---------- + [CRCProbStat2000]_ Section 14.7 + + .. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard + Probability and Statistics Tables and Formulae. Chapman & Hall: New + York. 2000. + + Examples + -------- + >>> spearmanr([1,2,3,4,5],[5,6,7,8,7]) + (0.82078268166812329, 0.088587005313543798) + >>> np.random.seed(1234321) + >>> x2n=np.random.randn(100,2) + >>> y2n=np.random.randn(100,2) + >>> spearmanr(x2n) + (0.059969996999699973, 0.55338590803773591) + >>> spearmanr(x2n[:,0], x2n[:,1]) + (0.059969996999699973, 0.55338590803773591) + >>> rho, pval = spearmanr(x2n,y2n) + >>> rho + array([[ 1. , 0.05997 , 0.18569457, 0.06258626], + [ 0.05997 , 1. , 0.110003 , 0.02534653], + [ 0.18569457, 0.110003 , 1. , 0.03488749], + [ 0.06258626, 0.02534653, 0.03488749, 1. ]]) + >>> pval + array([[ 0. , 0.55338591, 0.06435364, 0.53617935], + [ 0.55338591, 0. , 0.27592895, 0.80234077], + [ 0.06435364, 0.27592895, 0. , 0.73039992], + [ 0.53617935, 0.80234077, 0.73039992, 0. ]]) + >>> rho, pval = spearmanr(x2n.T, y2n.T, axis=1) + >>> rho + array([[ 1. , 0.05997 , 0.18569457, 0.06258626], + [ 0.05997 , 1. , 0.110003 , 0.02534653], + [ 0.18569457, 0.110003 , 1. , 0.03488749], + [ 0.06258626, 0.02534653, 0.03488749, 1. 
]]) + >>> spearmanr(x2n, y2n, axis=None) + (0.10816770419260482, 0.1273562188027364) + >>> spearmanr(x2n.ravel(), y2n.ravel()) + (0.10816770419260482, 0.1273562188027364) + + >>> xint = np.random.randint(10,size=(100,2)) + >>> spearmanr(xint) + (0.052760927029710199, 0.60213045837062351) + + """ + a, axisout = _chk_asarray(a, axis) + ar = np.apply_along_axis(rankdata,axisout,a) + + br = None + if not b is None: + b, axisout = _chk_asarray(b, axis) + br = np.apply_along_axis(rankdata,axisout,b) + n = a.shape[axisout] + rs = np.corrcoef(ar,br,rowvar=axisout) + + olderr = np.seterr(divide='ignore') # rs can have elements equal to 1 + try: + t = rs * np.sqrt((n-2) / ((rs+1.0)*(1.0-rs))) + finally: + np.seterr(**olderr) + prob = distributions.t.sf(np.abs(t),n-2)*2 + + if rs.shape == (2,2): + return rs[1,0], prob[1,0] + else: + return rs, prob + + +def pointbiserialr(x, y): + """Calculates a point biserial correlation coefficient and the associated + p-value. + + The point biserial correlation is used to measure the relationship + between a binary variable, x, and a continuous variable, y. Like other + correlation coefficients, this one varies between -1 and +1 with 0 + implying no correlation. Correlations of -1 or +1 imply a determinative + relationship. + + This function uses a shortcut formula but produces the same result as + `pearsonr`. + + Parameters + ---------- + x : array_like of bools + Input array. + y : array_like + Input array. + + Returns + ------- + r : float + R value + p-value : float + 2-tailed p-value + + References + ---------- + http://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient + + Examples + -------- + >>> from scipy import stats + >>> a = np.array([0, 0, 0, 1, 1, 1, 1]) + >>> b = np.arange(7) + >>> stats.pointbiserialr(a, b) + (0.8660254037844386, 0.011724811003954652) + >>> stats.pearsonr(a, b) + (0.86602540378443871, 0.011724811003954626) + >>> np.corrcoef(a, b) + array([[ 1. , 0.8660254], + [ 0.8660254, 1. ]]) + + """ + x = np.asarray(x, dtype=bool) + y = np.asarray(y, dtype=float) + n = len(x) + + # phat is the fraction of x values that are True + phat = x.sum() / float(len(x)) + y0 = y[~x] # y-values where x is False + y1 = y[x] # y-values where x is True + y0m = y0.mean() + y1m = y1.mean() + + # phat - phat**2 is more stable than phat*(1-phat) + rpb = (y1m - y0m) * np.sqrt(phat - phat**2) / y.std() + + df = n-2 + # fixme: see comment about TINY in pearsonr() + TINY = 1e-20 + t = rpb*np.sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY))) + prob = betai(0.5*df, 0.5, df/(df+t*t)) + return rpb, prob + + +def kendalltau(x, y, initial_lexsort=True): + """ + Calculates Kendall's tau, a correlation measure for ordinal data. + + Kendall's tau is a measure of the correspondence between two rankings. + Values close to 1 indicate strong agreement, values close to -1 indicate + strong disagreement. This is the tau-b version of Kendall's tau which + accounts for ties. + + Parameters + ---------- + x, y : array_like + Arrays of rankings, of the same shape. If arrays are not 1-D, they will + be flattened to 1-D. + initial_lexsort : bool, optional + Whether to use lexsort or quicksort as the sorting method for the + initial sort of the inputs. Default is lexsort (True), for which + `kendalltau` is of complexity O(n log(n)). If False, the complexity is + O(n^2), but with a smaller pre-factor (so quicksort may be faster for + small arrays). + + Returns + ------- + Kendall's tau : float + The tau statistic. 
+ p-value : float + The two-sided p-value for a hypothesis test whose null hypothesis is + an absence of association, tau = 0. + + Notes + ----- + The definition of Kendall's tau that is used is:: + + tau = (P - Q) / sqrt((P + Q + T) * (P + Q + U)) + + where P is the number of concordant pairs, Q the number of discordant + pairs, T the number of ties only in `x`, and U the number of ties only in + `y`. If a tie occurs for the same pair in both `x` and `y`, it is not + added to either T or U. + + References + ---------- + W.R. Knight, "A Computer Method for Calculating Kendall's Tau with + Ungrouped Data", Journal of the American Statistical Association, Vol. 61, + No. 314, Part 1, pp. 436-439, 1966. + + Examples + -------- + >>> x1 = [12, 2, 1, 12, 2] + >>> x2 = [1, 4, 7, 1, 0] + >>> tau, p_value = sp.stats.kendalltau(x1, x2) + >>> tau + -0.47140452079103173 + >>> p_value + 0.24821309157521476 + + """ + + x = np.asarray(x).ravel() + y = np.asarray(y).ravel() + n = np.int64(len(x)) + temp = list(range(n)) # support structure used by mergesort + # this closure recursively sorts sections of perm[] by comparing + # elements of y[perm[]] using temp[] as support + # returns the number of swaps required by an equivalent bubble sort + + def mergesort(offs, length): + exchcnt = 0 + if length == 1: + return 0 + if length == 2: + if y[perm[offs]] <= y[perm[offs+1]]: + return 0 + t = perm[offs] + perm[offs] = perm[offs+1] + perm[offs+1] = t + return 1 + length0 = length // 2 + length1 = length - length0 + middle = offs + length0 + exchcnt += mergesort(offs, length0) + exchcnt += mergesort(middle, length1) + if y[perm[middle - 1]] < y[perm[middle]]: + return exchcnt + # merging + i = j = k = 0 + while j < length0 or k < length1: + if k >= length1 or (j < length0 and y[perm[offs + j]] <= + y[perm[middle + k]]): + temp[i] = perm[offs + j] + d = i - j + j += 1 + else: + temp[i] = perm[middle + k] + d = (offs + i) - (middle + k) + k += 1 + if d > 0: + exchcnt += d + i += 1 + perm[offs:offs+length] = temp[0:length] + return exchcnt + + # initial sort on values of x and, if tied, on values of y + if initial_lexsort: + # sort implemented as mergesort, worst case: O(n log(n)) + perm = np.lexsort((y, x)) + else: + # sort implemented as quicksort, 30% faster but with worst case: O(n^2) + perm = list(range(n)) + perm.sort(key=lambda a: (x[a], y[a])) + + # compute joint ties + first = 0 + t = 0 + for i in xrange(1, n): + if x[perm[first]] != x[perm[i]] or y[perm[first]] != y[perm[i]]: + t += ((i - first) * (i - first - 1)) // 2 + first = i + t += ((n - first) * (n - first - 1)) // 2 + + # compute ties in x + first = 0 + u = 0 + for i in xrange(1,n): + if x[perm[first]] != x[perm[i]]: + u += ((i - first) * (i - first - 1)) // 2 + first = i + u += ((n - first) * (n - first - 1)) // 2 + + # count exchanges + exchanges = mergesort(0, n) + # compute ties in y after mergesort with counting + first = 0 + v = 0 + for i in xrange(1,n): + if y[perm[first]] != y[perm[i]]: + v += ((i - first) * (i - first - 1)) // 2 + first = i + v += ((n - first) * (n - first - 1)) // 2 + + tot = (n * (n - 1)) // 2 + if tot == u or tot == v: + return (np.nan, np.nan) # Special case for all ties in both ranks + + # Prevent overflow; equal to np.sqrt((tot - u) * (tot - v)) + denom = np.exp(0.5 * (np.log(tot - u) + np.log(tot - v))) + tau = ((tot - (v + u - t)) - 2.0 * exchanges) / denom + + # what follows reproduces the ending of Gary Strangman's original + # stats.kendalltau() in SciPy + svar = (4.0 * n + 10.0) / (9.0 * n * (n - 1)) + z = tau 
/ np.sqrt(svar) + prob = special.erfc(np.abs(z) / 1.4142136) + + return tau, prob + + +def linregress(x, y=None): + """ + Calculate a regression line + + This computes a least-squares regression for two sets of measurements. + + Parameters + ---------- + x, y : array_like + two sets of measurements. Both arrays should have the same length. + If only x is given (and y=None), then it must be a two-dimensional + array where one dimension has length 2. The two sets of measurements + are then found by splitting the array along the length-2 dimension. + + Returns + ------- + slope : float + slope of the regression line + intercept : float + intercept of the regression line + r-value : float + correlation coefficient + p-value : float + two-sided p-value for a hypothesis test whose null hypothesis is + that the slope is zero. + stderr : float + Standard error of the estimate + + + Examples + -------- + >>> from scipy import stats + >>> import numpy as np + >>> x = np.random.random(10) + >>> y = np.random.random(10) + >>> slope, intercept, r_value, p_value, std_err = stats.linregress(x,y) + + # To get coefficient of determination (r_squared) + + >>> print "r-squared:", r_value**2 + r-squared: 0.15286643777 + + """ + TINY = 1.0e-20 + if y is None: # x is a (2, N) or (N, 2) shaped array_like + x = asarray(x) + if x.shape[0] == 2: + x, y = x + elif x.shape[1] == 2: + x, y = x.T + else: + msg = "If only `x` is given as input, it has to be of shape (2, N) \ + or (N, 2), provided shape was %s" % str(x.shape) + raise ValueError(msg) + else: + x = asarray(x) + y = asarray(y) + n = len(x) + xmean = np.mean(x,None) + ymean = np.mean(y,None) + + # average sum of squares: + ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat + r_num = ssxym + r_den = np.sqrt(ssxm*ssym) + if r_den == 0.0: + r = 0.0 + else: + r = r_num / r_den + # test for numerical error propagation + if (r > 1.0): + r = 1.0 + elif (r < -1.0): + r = -1.0 + + df = n-2 + t = r*np.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY))) + prob = distributions.t.sf(np.abs(t),df)*2 + slope = r_num / ssxm + intercept = ymean - slope*xmean + sterrest = np.sqrt((1-r*r)*ssym / ssxm / df) + return slope, intercept, r, prob, sterrest + + +##################################### +##### INFERENTIAL STATISTICS ##### +##################################### + +def ttest_1samp(a, popmean, axis=0): + """ + Calculates the T-test for the mean of ONE group of scores. + + This is a two-sided test for the null hypothesis that the expected value + (mean) of a sample of independent observations `a` is equal to the given + population mean, `popmean`. + + Parameters + ---------- + a : array_like + sample observation + popmean : float or array_like + expected value in null hypothesis, if array_like than it must have the + same shape as `a` excluding the axis dimension + axis : int, optional, (default axis=0) + Axis can equal None (ravel array first), or an integer (the axis + over which to operate on a). + + Returns + ------- + t : float or array + t-statistic + prob : float or array + two-tailed p-value + + Examples + -------- + >>> from scipy import stats + + >>> np.random.seed(7654567) # fix seed to get the same result + >>> rvs = stats.norm.rvs(loc=5, scale=10, size=(50,2)) + + Test if mean of random sample is equal to true mean, and different mean. + We reject the null hypothesis in the second case and don't reject it in + the first case. 
+ + >>> stats.ttest_1samp(rvs,5.0) + (array([-0.68014479, -0.04323899]), array([ 0.49961383, 0.96568674])) + >>> stats.ttest_1samp(rvs,0.0) + (array([ 2.77025808, 4.11038784]), array([ 0.00789095, 0.00014999])) + + Examples using axis and non-scalar dimension for population mean. + + >>> stats.ttest_1samp(rvs,[5.0,0.0]) + (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04])) + >>> stats.ttest_1samp(rvs.T,[5.0,0.0],axis=1) + (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04])) + >>> stats.ttest_1samp(rvs,[[5.0],[0.0]]) + (array([[-0.68014479, -0.04323899], + [ 2.77025808, 4.11038784]]), array([[ 4.99613833e-01, 9.65686743e-01], + [ 7.89094663e-03, 1.49986458e-04]])) + + """ + a, axis = _chk_asarray(a, axis) + n = a.shape[axis] + df = n - 1 + + d = np.mean(a, axis) - popmean + v = np.var(a, axis, ddof=1) + denom = np.sqrt(v / float(n)) + + t = np.divide(d, denom) + t, prob = _ttest_finish(df, t) + + return t, prob + + +def _ttest_finish(df,t): + """Common code between all 3 t-test functions.""" + prob = distributions.t.sf(np.abs(t), df) * 2 # use np.abs to get upper tail + if t.ndim == 0: + t = t[()] + + return t, prob + + +def ttest_ind(a, b, axis=0, equal_var=True): + """ + Calculates the T-test for the means of TWO INDEPENDENT samples of scores. + + This is a two-sided test for the null hypothesis that 2 independent samples + have identical average (expected) values. This test assumes that the + populations have identical variances. + + Parameters + ---------- + a, b : array_like + The arrays must have the same shape, except in the dimension + corresponding to `axis` (the first, by default). + axis : int, optional + Axis can equal None (ravel array first), or an integer (the axis + over which to operate on a and b). + equal_var : bool, optional + If True (default), perform a standard independent 2 sample test + that assumes equal population variances [1]_. + If False, perform Welch's t-test, which does not assume equal + population variance [2]_. + + .. versionadded:: 0.11.0 + + Returns + ------- + t : float or array + The calculated t-statistic. + prob : float or array + The two-tailed p-value. + + Notes + ----- + We can use this test, if we observe two independent samples from + the same or different population, e.g. exam scores of boys and + girls or of two ethnic groups. The test measures whether the + average (expected) value differs significantly across samples. If + we observe a large p-value, for example larger than 0.05 or 0.1, + then we cannot reject the null hypothesis of identical average scores. + If the p-value is smaller than the threshold, e.g. 1%, 5% or 10%, + then we reject the null hypothesis of equal averages. + + References + ---------- + .. [1] http://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test + + .. 
[2] http://en.wikipedia.org/wiki/Welch%27s_t_test + + Examples + -------- + >>> from scipy import stats + >>> np.random.seed(12345678) + + Test with sample with identical means: + + >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500) + >>> rvs2 = stats.norm.rvs(loc=5,scale=10,size=500) + >>> stats.ttest_ind(rvs1,rvs2) + (0.26833823296239279, 0.78849443369564776) + >>> stats.ttest_ind(rvs1,rvs2, equal_var = False) + (0.26833823296239279, 0.78849452749500748) + + `ttest_ind` underestimates p for unequal variances: + + >>> rvs3 = stats.norm.rvs(loc=5, scale=20, size=500) + >>> stats.ttest_ind(rvs1, rvs3) + (-0.46580283298287162, 0.64145827413436174) + >>> stats.ttest_ind(rvs1, rvs3, equal_var = False) + (-0.46580283298287162, 0.64149646246569292) + + When n1 != n2, the equal variance t-statistic is no longer equal to the + unequal variance t-statistic: + + >>> rvs4 = stats.norm.rvs(loc=5, scale=20, size=100) + >>> stats.ttest_ind(rvs1, rvs4) + (-0.99882539442782481, 0.3182832709103896) + >>> stats.ttest_ind(rvs1, rvs4, equal_var = False) + (-0.69712570584654099, 0.48716927725402048) + + T-test with different means, variance, and n: + + >>> rvs5 = stats.norm.rvs(loc=8, scale=20, size=100) + >>> stats.ttest_ind(rvs1, rvs5) + (-1.4679669854490653, 0.14263895620529152) + >>> stats.ttest_ind(rvs1, rvs5, equal_var = False) + (-0.94365973617132992, 0.34744170334794122) + + """ + a, b, axis = _chk2_asarray(a, b, axis) + if a.size == 0 or b.size == 0: + return (np.nan, np.nan) + + v1 = np.var(a, axis, ddof=1) + v2 = np.var(b, axis, ddof=1) + n1 = a.shape[axis] + n2 = b.shape[axis] + + if (equal_var): + df = n1 + n2 - 2 + svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / float(df) + denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2)) + else: + vn1 = v1 / n1 + vn2 = v2 / n2 + df = ((vn1 + vn2)**2) / ((vn1**2) / (n1 - 1) + (vn2**2) / (n2 - 1)) + + # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0). + # Hence it doesn't matter what df is as long as it's not NaN. + df = np.where(np.isnan(df), 1, df) + denom = np.sqrt(vn1 + vn2) + + d = np.mean(a, axis) - np.mean(b, axis) + t = np.divide(d, denom) + t, prob = _ttest_finish(df, t) + + return t, prob + + +def ttest_rel(a, b, axis=0): + """ + Calculates the T-test on TWO RELATED samples of scores, a and b. + + This is a two-sided test for the null hypothesis that 2 related or + repeated samples have identical average (expected) values. + + Parameters + ---------- + a, b : array_like + The arrays must have the same shape. + axis : int, optional, (default axis=0) + Axis can equal None (ravel array first), or an integer (the axis + over which to operate on a and b). + + Returns + ------- + t : float or array + t-statistic + prob : float or array + two-tailed p-value + + Notes + ----- + Examples for the use are scores of the same set of student in + different exams, or repeated sampling from the same units. The + test measures whether the average score differs significantly + across samples (e.g. exams). If we observe a large p-value, for + example greater than 0.05 or 0.1 then we cannot reject the null + hypothesis of identical average scores. If the p-value is smaller + than the threshold, e.g. 1%, 5% or 10%, then we reject the null + hypothesis of equal averages. Small p-values are associated with + large t-statistics. 
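+
+    Because the statistic is computed from the paired differences, the result
+    agrees with ``ttest_1samp(a - b, 0)``; a quick consistency sketch:
+
+    >>> from scipy import stats
+    >>> a = np.array([71., 68., 72., 75.])
+    >>> b = np.array([70., 69., 74., 74.])
+    >>> np.allclose(stats.ttest_rel(a, b), stats.ttest_1samp(a - b, 0.0))
+    True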
+ + References + ---------- + http://en.wikipedia.org/wiki/T-test#Dependent_t-test + + Examples + -------- + >>> from scipy import stats + >>> np.random.seed(12345678) # fix random seed to get same numbers + + >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500) + >>> rvs2 = (stats.norm.rvs(loc=5,scale=10,size=500) + + ... stats.norm.rvs(scale=0.2,size=500)) + >>> stats.ttest_rel(rvs1,rvs2) + (0.24101764965300962, 0.80964043445811562) + >>> rvs3 = (stats.norm.rvs(loc=8,scale=10,size=500) + + ... stats.norm.rvs(scale=0.2,size=500)) + >>> stats.ttest_rel(rvs1,rvs3) + (-3.9995108708727933, 7.3082402191726459e-005) + + """ + a, b, axis = _chk2_asarray(a, b, axis) + if a.shape[axis] != b.shape[axis]: + raise ValueError('unequal length arrays') + + if a.size == 0 or b.size == 0: + return (np.nan, np.nan) + + n = a.shape[axis] + df = float(n - 1) + + d = (a - b).astype(np.float64) + v = np.var(d, axis, ddof=1) + dm = np.mean(d, axis) + denom = np.sqrt(v / float(n)) + + t = np.divide(dm, denom) + t, prob = _ttest_finish(df, t) + + return t, prob + + +def kstest(rvs, cdf, args=(), N=20, alternative='two-sided', mode='approx'): + """ + Perform the Kolmogorov-Smirnov test for goodness of fit. + + This performs a test of the distribution G(x) of an observed + random variable against a given distribution F(x). Under the null + hypothesis the two distributions are identical, G(x)=F(x). The + alternative hypothesis can be either 'two-sided' (default), 'less' + or 'greater'. The KS test is only valid for continuous distributions. + + Parameters + ---------- + rvs : str, array or callable + If a string, it should be the name of a distribution in `scipy.stats`. + If an array, it should be a 1-D array of observations of random + variables. + If a callable, it should be a function to generate random variables; + it is required to have a keyword argument `size`. + cdf : str or callable + If a string, it should be the name of a distribution in `scipy.stats`. + If `rvs` is a string then `cdf` can be False or the same as `rvs`. + If a callable, that callable is used to calculate the cdf. + args : tuple, sequence, optional + Distribution parameters, used if `rvs` or `cdf` are strings. + N : int, optional + Sample size if `rvs` is string or callable. Default is 20. + alternative : {'two-sided', 'less','greater'}, optional + Defines the alternative hypothesis (see explanation above). + Default is 'two-sided'. + mode : 'approx' (default) or 'asymp', optional + Defines the distribution used for calculating the p-value. + + - 'approx' : use approximation to exact distribution of test statistic + - 'asymp' : use asymptotic distribution of test statistic + + Returns + ------- + D : float + KS test statistic, either D, D+ or D-. + p-value : float + One-tailed or two-tailed p-value. + + Notes + ----- + In the one-sided test, the alternative is that the empirical + cumulative distribution function of the random variable is "less" + or "greater" than the cumulative distribution function F(x) of the + hypothesis, ``G(x)<=F(x)``, resp. ``G(x)>=F(x)``. 
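+
+    In terms of the ordered sample ``x_1 <= ... <= x_N``, the statistics used
+    here are ``D+ = max_i(i/N - F(x_i))`` and ``D- = max_i(F(x_i) - (i-1)/N)``,
+    with ``D = max(D+, D-)`` for the two-sided case; this mirrors the
+    computation in the function body below.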
+ + Examples + -------- + >>> from scipy import stats + + >>> x = np.linspace(-15, 15, 9) + >>> stats.kstest(x, 'norm') + (0.44435602715924361, 0.038850142705171065) + + >>> np.random.seed(987654321) # set random seed to get the same result + >>> stats.kstest('norm', False, N=100) + (0.058352892479417884, 0.88531190944151261) + + The above lines are equivalent to: + + >>> np.random.seed(987654321) + >>> stats.kstest(stats.norm.rvs(size=100), 'norm') + (0.058352892479417884, 0.88531190944151261) + + *Test against one-sided alternative hypothesis* + + Shift distribution to larger values, so that ``cdf_dgp(x) < norm.cdf(x)``: + + >>> np.random.seed(987654321) + >>> x = stats.norm.rvs(loc=0.2, size=100) + >>> stats.kstest(x,'norm', alternative = 'less') + (0.12464329735846891, 0.040989164077641749) + + Reject equal distribution against alternative hypothesis: less + + >>> stats.kstest(x,'norm', alternative = 'greater') + (0.0072115233216311081, 0.98531158590396395) + + Don't reject equal distribution against alternative hypothesis: greater + + >>> stats.kstest(x,'norm', mode='asymp') + (0.12464329735846891, 0.08944488871182088) + + *Testing t distributed random variables against normal distribution* + + With 100 degrees of freedom the t distribution looks close to the normal + distribution, and the K-S test does not reject the hypothesis that the + sample came from the normal distribution: + + >>> np.random.seed(987654321) + >>> stats.kstest(stats.t.rvs(100,size=100),'norm') + (0.072018929165471257, 0.67630062862479168) + + With 3 degrees of freedom the t distribution looks sufficiently different + from the normal distribution, that we can reject the hypothesis that the + sample came from the normal distribution at the 10% level: + + >>> np.random.seed(987654321) + >>> stats.kstest(stats.t.rvs(3,size=100),'norm') + (0.131016895759829, 0.058826222555312224) + + """ + if isinstance(rvs, string_types): + if (not cdf) or (cdf == rvs): + cdf = getattr(distributions, rvs).cdf + rvs = getattr(distributions, rvs).rvs + else: + raise AttributeError("if rvs is string, cdf has to be the " + "same distribution") + + if isinstance(cdf, string_types): + cdf = getattr(distributions, cdf).cdf + if callable(rvs): + kwds = {'size':N} + vals = np.sort(rvs(*args,**kwds)) + else: + vals = np.sort(rvs) + N = len(vals) + cdfvals = cdf(vals, *args) + + # to not break compatibility with existing code + if alternative == 'two_sided': + alternative = 'two-sided' + + if alternative in ['two-sided', 'greater']: + Dplus = (np.arange(1.0, N+1)/N - cdfvals).max() + if alternative == 'greater': + return Dplus, distributions.ksone.sf(Dplus,N) + + if alternative in ['two-sided', 'less']: + Dmin = (cdfvals - np.arange(0.0, N)/N).max() + if alternative == 'less': + return Dmin, distributions.ksone.sf(Dmin,N) + + if alternative == 'two-sided': + D = np.max([Dplus,Dmin]) + if mode == 'asymp': + return D, distributions.kstwobign.sf(D*np.sqrt(N)) + if mode == 'approx': + pval_two = distributions.kstwobign.sf(D*np.sqrt(N)) + if N > 2666 or pval_two > 0.80 - N*0.3/1000.0: + return D, distributions.kstwobign.sf(D*np.sqrt(N)) + else: + return D, distributions.ksone.sf(D,N)*2 + + +# Map from names to lambda_ values used in power_divergence(). +_power_div_lambda_names = { + "pearson": 1, + "log-likelihood": 0, + "freeman-tukey": -0.5, + "mod-log-likelihood": -1, + "neyman": -2, + "cressie-read": 2/3, +} + + +def _count(a, axis=None): + """ + Count the number of non-masked elements of an array. 
+ + This function behaves like np.ma.count(), but is much faster + for ndarrays. + """ + if hasattr(a, 'count'): + num = a.count(axis=axis) + if isinstance(num, np.ndarray) and num.ndim == 0: + # In some cases, the `count` method returns a scalar array (e.g. + # np.array(3)), but we want a plain integer. + num = int(num) + else: + if axis is None: + num = a.size + else: + num = a.shape[axis] + return num + + +def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None): + """ + Cressie-Read power divergence statistic and goodness of fit test. + + This function tests the null hypothesis that the categorical data + has the given frequencies, using the Cressie-Read power divergence + statistic. + + Parameters + ---------- + f_obs : array_like + Observed frequencies in each category. + f_exp : array_like, optional + Expected frequencies in each category. By default the categories are + assumed to be equally likely. + ddof : int, optional + "Delta degrees of freedom": adjustment to the degrees of freedom + for the p-value. The p-value is computed using a chi-squared + distribution with ``k - 1 - ddof`` degrees of freedom, where `k` + is the number of observed frequencies. The default value of `ddof` + is 0. + axis : int or None, optional + The axis of the broadcast result of `f_obs` and `f_exp` along which to + apply the test. If axis is None, all values in `f_obs` are treated + as a single data set. Default is 0. + lambda_ : float or str, optional + `lambda_` gives the power in the Cressie-Read power divergence + statistic. The default is 1. For convenience, `lambda_` may be + assigned one of the following strings, in which case the + corresponding numerical value is used:: + + String Value Description + "pearson" 1 Pearson's chi-squared statistic. + In this case, the function is + equivalent to `stats.chisquare`. + "log-likelihood" 0 Log-likelihood ratio. Also known as + the G-test [3]_. + "freeman-tukey" -1/2 Freeman-Tukey statistic. + "mod-log-likelihood" -1 Modified log-likelihood ratio. + "neyman" -2 Neyman's statistic. + "cressie-read" 2/3 The power recommended in [5]_. + + Returns + ------- + stat : float or ndarray + The Cressie-Read power divergence test statistic. The value is + a float if `axis` is None or if` `f_obs` and `f_exp` are 1-D. + p : float or ndarray + The p-value of the test. The value is a float if `ddof` and the + return value `stat` are scalars. + + See Also + -------- + chisquare + + Notes + ----- + This test is invalid when the observed or expected frequencies in each + category are too small. A typical rule is that all of the observed + and expected frequencies should be at least 5. + + When `lambda_` is less than zero, the formula for the statistic involves + dividing by `f_obs`, so a warning or error may be generated if any value + in `f_obs` is 0. + + Similarly, a warning or error may be generated if any value in `f_exp` is + zero when `lambda_` >= 0. + + The default degrees of freedom, k-1, are for the case when no parameters + of the distribution are estimated. If p parameters are estimated by + efficient maximum likelihood then the correct degrees of freedom are + k-1-p. If the parameters are estimated in a different way, then the + dof can be between k-1-p and k-1. However, it is also possible that + the asymptotic distribution is not a chisquare, in which case this + test is not appropriate. + + This function handles masked arrays. 
If an element of `f_obs` or `f_exp` + is masked, then data at that position is ignored, and does not count + towards the size of the data set. + + .. versionadded:: 0.13.0 + + References + ---------- + .. [1] Lowry, Richard. "Concepts and Applications of Inferential + Statistics". Chapter 8. http://faculty.vassar.edu/lowry/ch8pt1.html + .. [2] "Chi-squared test", http://en.wikipedia.org/wiki/Chi-squared_test + .. [3] "G-test", http://en.wikipedia.org/wiki/G-test + .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and + practice of statistics in biological research", New York: Freeman + (1981) + .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit + Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984), + pp. 440-464. + + Examples + -------- + + (See `chisquare` for more examples.) + + When just `f_obs` is given, it is assumed that the expected frequencies + are uniform and given by the mean of the observed frequencies. Here we + perform a G-test (i.e. use the log-likelihood ratio statistic): + + >>> power_divergence([16, 18, 16, 14, 12, 12], method='log-likelihood') + (2.006573162632538, 0.84823476779463769) + + The expected frequencies can be given with the `f_exp` argument: + + >>> power_divergence([16, 18, 16, 14, 12, 12], + ... f_exp=[16, 16, 16, 16, 16, 8], + ... lambda_='log-likelihood') + (3.5, 0.62338762774958223) + + When `f_obs` is 2-D, by default the test is applied to each column. + + >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T + >>> obs.shape + (6, 2) + >>> power_divergence(obs, lambda_="log-likelihood") + (array([ 2.00657316, 6.77634498]), array([ 0.84823477, 0.23781225])) + + By setting ``axis=None``, the test is applied to all data in the array, + which is equivalent to applying the test to the flattened array. + + >>> power_divergence(obs, axis=None) + (23.31034482758621, 0.015975692534127565) + >>> power_divergence(obs.ravel()) + (23.31034482758621, 0.015975692534127565) + + `ddof` is the change to make to the default degrees of freedom. + + >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1) + (2.0, 0.73575888234288467) + + The calculation of the p-values is done by broadcasting the + test statistic with `ddof`. + + >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=[0,1,2]) + (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ])) + + `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has + shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting + `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared + statistics, we must use ``axis=1``: + + >>> power_divergence([16, 18, 16, 14, 12, 12], + ... f_exp=[[16, 16, 16, 16, 16, 8], + ... [8, 20, 20, 16, 12, 12]], + ... axis=1) + (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) + + """ + # Convert the input argument `lambda_` to a numerical value. + if isinstance(lambda_, string_types): + if lambda_ not in _power_div_lambda_names: + names = repr(list(_power_div_lambda_names.keys()))[1:-1] + raise ValueError("invalid string for lambda_: {0!r}. Valid strings " + "are {1}".format(lambda_, names)) + lambda_ = _power_div_lambda_names[lambda_] + elif lambda_ is None: + lambda_ = 1 + + f_obs = np.asanyarray(f_obs) + + if f_exp is not None: + f_exp = np.atleast_1d(np.asanyarray(f_exp)) + else: + # Compute the equivalent of + # f_exp = f_obs.mean(axis=axis, keepdims=True) + # Older versions of numpy do not have the 'keepdims' argument, so + # we have to do a little work to achieve the same result. 
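+        # The reduced axis is restored below with length 1 so that `f_exp`
+        # broadcasts against `f_obs` when the per-category terms are formed.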
+ # Ignore 'invalid' errors so the edge case of a data set with length 0 + # is handled without spurious warnings. + with np.errstate(invalid='ignore'): + f_exp = np.atleast_1d(f_obs.mean(axis=axis)) + if axis is not None: + reduced_shape = list(f_obs.shape) + reduced_shape[axis] = 1 + f_exp.shape = reduced_shape + + # `terms` is the array of terms that are summed along `axis` to create + # the test statistic. We use some specialized code for a few special + # cases of lambda_. + if lambda_ == 1: + # Pearson's chi-squared statistic + terms = (f_obs - f_exp)**2 / f_exp + elif lambda_ == 0: + # Log-likelihood ratio (i.e. G-test) + terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp) + elif lambda_ == -1: + # Modified log-likelihood ratio + terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs) + else: + # General Cressie-Read power divergence. + terms = f_obs * ((f_obs / f_exp)**lambda_ - 1) + terms /= 0.5 * lambda_ * (lambda_ + 1) + + stat = terms.sum(axis=axis) + + num_obs = _count(terms, axis=axis) + ddof = asarray(ddof) + p = chisqprob(stat, num_obs - 1 - ddof) + + return stat, p + + +def chisquare(f_obs, f_exp=None, ddof=0, axis=0): + """ + Calculates a one-way chi square test. + + The chi square test tests the null hypothesis that the categorical data + has the given frequencies. + + Parameters + ---------- + f_obs : array_like + Observed frequencies in each category. + f_exp : array_like, optional + Expected frequencies in each category. By default the categories are + assumed to be equally likely. + ddof : int, optional + "Delta degrees of freedom": adjustment to the degrees of freedom + for the p-value. The p-value is computed using a chi-squared + distribution with ``k - 1 - ddof`` degrees of freedom, where `k` + is the number of observed frequencies. The default value of `ddof` + is 0. + axis : int or None, optional + The axis of the broadcast result of `f_obs` and `f_exp` along which to + apply the test. If axis is None, all values in `f_obs` are treated + as a single data set. Default is 0. + + Returns + ------- + chisq : float or ndarray + The chi-squared test statistic. The value is a float if `axis` is + None or `f_obs` and `f_exp` are 1-D. + p : float or ndarray + The p-value of the test. The value is a float if `ddof` and the + return value `chisq` are scalars. + + See Also + -------- + power_divergence + mstats.chisquare + + Notes + ----- + This test is invalid when the observed or expected frequencies in each + category are too small. A typical rule is that all of the observed + and expected frequencies should be at least 5. + + The default degrees of freedom, k-1, are for the case when no parameters + of the distribution are estimated. If p parameters are estimated by + efficient maximum likelihood then the correct degrees of freedom are + k-1-p. If the parameters are estimated in a different way, then the + dof can be between k-1-p and k-1. However, it is also possible that + the asymptotic distribution is not a chisquare, in which case this + test is not appropriate. + + References + ---------- + .. [1] Lowry, Richard. "Concepts and Applications of Inferential + Statistics". Chapter 8. http://faculty.vassar.edu/lowry/ch8pt1.html + .. [2] "Chi-squared test", http://en.wikipedia.org/wiki/Chi-squared_test + + Examples + -------- + When just `f_obs` is given, it is assumed that the expected frequencies + are uniform and given by the mean of the observed frequencies. 
+ + >>> chisquare([16, 18, 16, 14, 12, 12]) + (2.0, 0.84914503608460956) + + With `f_exp` the expected frequencies can be given. + + >>> chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8]) + (3.5, 0.62338762774958223) + + When `f_obs` is 2-D, by default the test is applied to each column. + + >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T + >>> obs.shape + (6, 2) + >>> chisquare(obs) + (array([ 2. , 6.66666667]), array([ 0.84914504, 0.24663415])) + + By setting ``axis=None``, the test is applied to all data in the array, + which is equivalent to applying the test to the flattened array. + + >>> chisquare(obs, axis=None) + (23.31034482758621, 0.015975692534127565) + >>> chisquare(obs.ravel()) + (23.31034482758621, 0.015975692534127565) + + `ddof` is the change to make to the default degrees of freedom. + + >>> chisquare([16, 18, 16, 14, 12, 12], ddof=1) + (2.0, 0.73575888234288467) + + The calculation of the p-values is done by broadcasting the + chi-squared statistic with `ddof`. + + >>> chisquare([16, 18, 16, 14, 12, 12], ddof=[0,1,2]) + (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ])) + + `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has + shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting + `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared + statistics, we use ``axis=1``: + + >>> chisquare([16, 18, 16, 14, 12, 12], + ... f_exp=[[16, 16, 16, 16, 16, 8], [8, 20, 20, 16, 12, 12]], + ... axis=1) + (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) + + """ + return power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis, + lambda_="pearson") + + +def ks_2samp(data1, data2): + """ + Computes the Kolmogorov-Smirnov statistic on 2 samples. + + This is a two-sided test for the null hypothesis that 2 independent samples + are drawn from the same continuous distribution. + + Parameters + ---------- + a, b : sequence of 1-D ndarrays + two arrays of sample observations assumed to be drawn from a continuous + distribution, sample sizes can be different + + Returns + ------- + D : float + KS statistic + p-value : float + two-tailed p-value + + Notes + ----- + This tests whether 2 samples are drawn from the same distribution. Note + that, like in the case of the one-sample K-S test, the distribution is + assumed to be continuous. + + This is the two-sided test, one-sided tests are not implemented. + The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution. + + If the K-S statistic is small or the p-value is high, then we cannot + reject the hypothesis that the distributions of the two samples + are the same. 
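+
+    The p-value is computed from the asymptotic Kolmogorov-Smirnov
+    distribution evaluated at ``(en + 0.12 + 0.11/en) * D``, where
+    ``en = sqrt(n1*n2/(n1 + n2))``, matching the implementation below.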
+ + Examples + -------- + >>> from scipy import stats + >>> np.random.seed(12345678) #fix random seed to get the same result + >>> n1 = 200 # size of first sample + >>> n2 = 300 # size of second sample + + For a different distribution, we can reject the null hypothesis since the + pvalue is below 1%: + + >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1) + >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5) + >>> stats.ks_2samp(rvs1, rvs2) + (0.20833333333333337, 4.6674975515806989e-005) + + For a slightly different distribution, we cannot reject the null hypothesis + at a 10% or lower alpha since the p-value at 0.144 is higher than 10% + + >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0) + >>> stats.ks_2samp(rvs1, rvs3) + (0.10333333333333333, 0.14498781825751686) + + For an identical distribution, we cannot reject the null hypothesis since + the p-value is high, 41%: + + >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0) + >>> stats.ks_2samp(rvs1, rvs4) + (0.07999999999999996, 0.41126949729859719) + + """ + data1, data2 = map(asarray, (data1, data2)) + n1 = data1.shape[0] + n2 = data2.shape[0] + n1 = len(data1) + n2 = len(data2) + data1 = np.sort(data1) + data2 = np.sort(data2) + data_all = np.concatenate([data1,data2]) + cdf1 = np.searchsorted(data1,data_all,side='right')/(1.0*n1) + cdf2 = (np.searchsorted(data2,data_all,side='right'))/(1.0*n2) + d = np.max(np.absolute(cdf1-cdf2)) + # Note: d absolute not signed distance + en = np.sqrt(n1*n2/float(n1+n2)) + try: + prob = ksprob((en+0.12+0.11/en)*d) + except: + prob = 1.0 + return d, prob + + +def mannwhitneyu(x, y, use_continuity=True): + """ + Computes the Mann-Whitney rank test on samples x and y. + + Parameters + ---------- + x, y : array_like + Array of samples, should be one-dimensional. + use_continuity : bool, optional + Whether a continuity correction (1/2.) should be taken into + account. Default is True. + + Returns + ------- + u : float + The Mann-Whitney statistics. + prob : float + One-sided p-value assuming a asymptotic normal distribution. + + Notes + ----- + Use only when the number of observation in each sample is > 20 and + you have 2 independent samples of ranks. Mann-Whitney U is + significant if the u-obtained is LESS THAN or equal to the critical + value of U. + + This test corrects for ties and by default uses a continuity correction. + The reported p-value is for a one-sided hypothesis, to get the two-sided + p-value multiply the returned p-value by 2. + + """ + x = asarray(x) + y = asarray(y) + n1 = len(x) + n2 = len(y) + ranked = rankdata(np.concatenate((x,y))) + rankx = ranked[0:n1] # get the x-ranks + u1 = n1*n2 + (n1*(n1+1))/2.0 - np.sum(rankx,axis=0) # calc U for x + u2 = n1*n2 - u1 # remainder is U for y + bigu = max(u1,u2) + smallu = min(u1,u2) + T = tiecorrect(ranked) + if T == 0: + raise ValueError('All numbers are identical in amannwhitneyu') + sd = np.sqrt(T*n1*n2*(n1+n2+1)/12.0) + + if use_continuity: + # normal approximation for prob calc with continuity correction + z = abs((bigu-0.5-n1*n2/2.0) / sd) + else: + z = abs((bigu-n1*n2/2.0) / sd) # normal approximation for prob calc + return smallu, distributions.norm.sf(z) # (1.0 - zprob(z)) + + +def ranksums(x, y): + """ + Compute the Wilcoxon rank-sum statistic for two samples. + + The Wilcoxon rank-sum test tests the null hypothesis that two sets + of measurements are drawn from the same distribution. The alternative + hypothesis is that values in one sample are more likely to be + larger than the values in the other sample. 
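+ + The test statistic is the sum of the ranks of the ``x`` observations in the pooled sample, standardized by its null mean ``n1*(n1 + n2 + 1)/2`` and standard deviation ``sqrt(n1*n2*(n1 + n2 + 1)/12)``.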
+ + This test should be used to compare two samples from continuous + distributions. It does not handle ties between measurements + in x and y. For tie-handling and an optional continuity correction + see `scipy.stats.mannwhitneyu`. + + Parameters + ---------- + x,y : array_like + The data from the two samples + + Returns + ------- + z-statistic : float + The test statistic under the large-sample approximation that the + rank sum statistic is normally distributed + p-value : float + The two-sided p-value of the test + + References + ---------- + .. [1] http://en.wikipedia.org/wiki/Wilcoxon_rank-sum_test + + """ + x,y = map(np.asarray, (x, y)) + n1 = len(x) + n2 = len(y) + alldata = np.concatenate((x,y)) + ranked = rankdata(alldata) + x = ranked[:n1] + y = ranked[n1:] + s = np.sum(x,axis=0) + expected = n1*(n1+n2+1) / 2.0 + z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0) + prob = 2 * distributions.norm.sf(abs(z)) + return z, prob + + +def kruskal(*args): + """ + Compute the Kruskal-Wallis H-test for independent samples + + The Kruskal-Wallis H-test tests the null hypothesis that the population + median of all of the groups are equal. It is a non-parametric version of + ANOVA. The test works on 2 or more independent samples, which may have + different sizes. Note that rejecting the null hypothesis does not + indicate which of the groups differs. Post-hoc comparisons between + groups are required to determine which groups are different. + + Parameters + ---------- + sample1, sample2, ... : array_like + Two or more arrays with the sample measurements can be given as + arguments. + + Returns + ------- + H-statistic : float + The Kruskal-Wallis H statistic, corrected for ties + p-value : float + The p-value for the test using the assumption that H has a chi + square distribution + + Notes + ----- + Due to the assumption that H has a chi square distribution, the number + of samples in each group must not be too small. A typical rule is + that each sample must have at least 5 measurements. + + References + ---------- + .. [1] http://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance + + """ + args = list(map(np.asarray, args)) # convert to a numpy array + na = len(args) # Kruskal-Wallis on 'na' groups, each in it's own array + if na < 2: + raise ValueError("Need at least two groups in stats.kruskal()") + n = np.asarray(list(map(len, args))) + + alldata = np.concatenate(args) + + ranked = rankdata(alldata) # Rank the data + T = tiecorrect(ranked) # Correct for ties + if T == 0: + raise ValueError('All numbers are identical in kruskal') + + # Compute sum^2/n for each group and sum + j = np.insert(np.cumsum(n), 0, 0) + ssbn = 0 + for i in range(na): + ssbn += square_of_sums(ranked[j[i]:j[i+1]]) / float(n[i]) + + totaln = np.sum(n) + h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1) + df = na - 1 + h = h / float(T) + return h, chisqprob(h, df) + + +def friedmanchisquare(*args): + """ + Computes the Friedman test for repeated measurements + + The Friedman test tests the null hypothesis that repeated measurements of + the same individuals have the same distribution. It is often used + to test for consistency among measurements obtained in different ways. + For example, if two measurement techniques are used on the same set of + individuals, the Friedman test can be used to determine if the two + measurement techniques are consistent. + + Parameters + ---------- + measurements1, measurements2, measurements3... : array_like + Arrays of measurements. 
All of the arrays must have the same number + of elements. At least 3 sets of measurements must be given. + + Returns + ------- + friedman chi-square statistic : float + the test statistic, correcting for ties + p-value : float + the associated p-value assuming that the test statistic has a chi + squared distribution + + Notes + ----- + Due to the assumption that the test statistic has a chi squared + distribution, the p-value is only reliable for n > 10 and more than + 6 repeated measurements. + + References + ---------- + .. [1] http://en.wikipedia.org/wiki/Friedman_test + + """ + k = len(args) + if k < 3: + raise ValueError('\nLess than 3 levels. Friedman test not appropriate.\n') + + n = len(args[0]) + for i in range(1, k): + if len(args[i]) != n: + raise ValueError('Unequal N in friedmanchisquare. Aborting.') + + # Rank data + data = np.vstack(args).T + data = data.astype(float) + for i in range(len(data)): + data[i] = rankdata(data[i]) + + # Handle ties + ties = 0 + for i in range(len(data)): + replist, repnum = find_repeats(array(data[i])) + for t in repnum: + ties += t*(t*t-1) + c = 1 - ties / float(k*(k*k-1)*n) + + ssbn = pysum(pysum(data)**2) + chisq = (12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)) / c + return chisq, chisqprob(chisq,k-1) + + +##################################### +#### PROBABILITY CALCULATIONS #### +##################################### + +zprob = special.ndtr + + +def chisqprob(chisq, df): + """ + Probability value (1-tail) for the Chi^2 probability distribution. + + Broadcasting rules apply. + + Parameters + ---------- + chisq : array_like or float > 0 + + df : array_like or float, probably int >= 1 + + Returns + ------- + chisqprob : ndarray + The area from `chisq` to infinity under the Chi^2 probability + distribution with degrees of freedom `df`. + + """ + return special.chdtrc(df,chisq) + +ksprob = special.kolmogorov +fprob = special.fdtrc + + +def betai(a, b, x): + """ + Returns the incomplete beta function. + + I_x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt) + + where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma + function of a. + + The standard broadcasting rules apply to a, b, and x. + + Parameters + ---------- + a : array_like or float > 0 + + b : array_like or float > 0 + + x : array_like or float + x will be clipped to be no greater than 1.0 . + + Returns + ------- + betai : ndarray + Incomplete beta function. + + """ + x = np.asarray(x) + x = np.where(x < 1.0, x, 1.0) # if x > 1 then return 1.0 + return special.betainc(a, b, x) + + +##################################### +####### ANOVA CALCULATIONS ####### +##################################### + +def f_value_wilks_lambda(ER, EF, dfnum, dfden, a, b): + """Calculation of Wilks lambda F-statistic for multivarite data, per + Maxwell & Delaney p.657. + """ + if isinstance(ER, (int, float)): + ER = array([[ER]]) + if isinstance(EF, (int, float)): + EF = array([[EF]]) + lmbda = linalg.det(EF) / linalg.det(ER) + if (a-1)**2 + (b-1)**2 == 5: + q = 1 + else: + q = np.sqrt(((a-1)**2*(b-1)**2 - 2) / ((a-1)**2 + (b-1)**2 - 5)) + n_um = (1 - lmbda**(1.0/q))*(a-1)*(b-1) + d_en = lmbda**(1.0/q) / (n_um*q - 0.5*(a-1)*(b-1) + 1) + return n_um / d_en + + +def f_value(ER, EF, dfR, dfF): + """ + Returns an F-statistic for a restricted vs. unrestricted model. 
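+ + The statistic is computed as ``((ER - EF) / (dfR - dfF)) / (EF / dfF)``, i.e. the per-degree-of-freedom drop in the residual sum of squares divided by the mean squared error of the unrestricted model.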
+ + Parameters + ---------- + ER : float + `ER` is the sum of squared residuals for the restricted model + or null hypothesis + + EF : float + `EF` is the sum of squared residuals for the unrestricted model + or alternate hypothesis + + dfR : int + `dfR` is the degrees of freedom in the restricted model + + dfF : int + `dfF` is the degrees of freedom in the unrestricted model + + Returns + ------- + F-statistic : float + + """ + return ((ER-EF)/float(dfR-dfF) / (EF/float(dfF))) + + +def f_value_multivariate(ER, EF, dfnum, dfden): + """ + Returns a multivariate F-statistic. + + Parameters + ---------- + ER : ndarray + Error associated with the null hypothesis (the Restricted model). + From a multivariate F calculation. + EF : ndarray + Error associated with the alternate hypothesis (the Full model) + From a multivariate F calculation. + dfnum : int + Degrees of freedom the Restricted model. + dfden : int + Degrees of freedom associated with the Restricted model. + + Returns + ------- + fstat : float + The computed F-statistic. + + """ + if isinstance(ER, (int, float)): + ER = array([[ER]]) + if isinstance(EF, (int, float)): + EF = array([[EF]]) + n_um = (linalg.det(ER) - linalg.det(EF)) / float(dfnum) + d_en = linalg.det(EF) / float(dfden) + return n_um / d_en + + +##################################### +####### SUPPORT FUNCTIONS ######## +##################################### + +def ss(a, axis=0): + """ + Squares each element of the input array, and returns the sum(s) of that. + + Parameters + ---------- + a : array_like + Input array. + axis : int or None, optional + The axis along which to calculate. If None, use whole array. + Default is 0, i.e. along the first axis. + + Returns + ------- + ss : ndarray + The sum along the given axis for (a**2). + + See also + -------- + square_of_sums : The square(s) of the sum(s) (the opposite of `ss`). + + Examples + -------- + >>> from scipy import stats + >>> a = np.array([1., 2., 5.]) + >>> stats.ss(a) + 30.0 + + And calculating along an axis: + + >>> b = np.array([[1., 2., 5.], [2., 5., 6.]]) + >>> stats.ss(b, axis=1) + array([ 30., 65.]) + + """ + a, axis = _chk_asarray(a, axis) + return np.sum(a*a, axis) + + +def square_of_sums(a, axis=0): + """ + Sums elements of the input array, and returns the square(s) of that sum. + + Parameters + ---------- + a : array_like + Input array. + axis : int or None, optional + If axis is None, ravel `a` first. If `axis` is an integer, this will + be the axis over which to operate. Defaults to 0. + + Returns + ------- + square_of_sums : float or ndarray + The square of the sum over `axis`. + + See also + -------- + ss : The sum of squares (the opposite of `square_of_sums`). + + Examples + -------- + >>> from scipy import stats + >>> a = np.arange(20).reshape(5,4) + >>> stats.square_of_sums(a) + array([ 1600., 2025., 2500., 3025.]) + >>> stats.square_of_sums(a, axis=None) + 36100.0 + + """ + a, axis = _chk_asarray(a, axis) + s = np.sum(a,axis) + if not np.isscalar(s): + return s.astype(float)*s + else: + return float(s)*s + + +def fastsort(a): + """ + Sort an array and provide the argsort. + + Parameters + ---------- + a : array_like + Input array. + + Returns + ------- + fastsort : ndarray of type int + sorted indices into the original array + + """ + # TODO: the wording in the docstring is nonsense. 
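+ # Note: contrary to the docstring above, this returns a tuple of the sorted copy of `a` and the argsort index array, not just the indices.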
+ it = np.argsort(a) + as_ = a[it] + return as_, it diff --git a/pywafo/src/wafo/stats/tests/common_tests.py b/pywafo/src/wafo/stats/tests/common_tests.py new file mode 100644 index 0000000..3b9691c --- /dev/null +++ b/pywafo/src/wafo/stats/tests/common_tests.py @@ -0,0 +1,154 @@ +from __future__ import division, print_function, absolute_import + +import inspect +import warnings + +import numpy as np +import numpy.testing as npt + +#from scipy.lib._version import NumpyVersion +from scipy import stats + + +#NUMPY_BELOW_1_7 = NumpyVersion(np.__version__) < '1.7.0' +NUMPY_BELOW_1_7 =np.__version__ < '1.7.0' + + +def check_normalization(distfn, args, distname): + norm_moment = distfn.moment(0, *args) + npt.assert_allclose(norm_moment, 1.0) + + # this is a temporary plug: either ncf or expect is problematic; + # best be marked as a knownfail, but I've no clue how to do it. + if distname == "ncf": + atol, rtol = 1e-5, 0 + else: + atol, rtol = 1e-7, 1e-7 + + normalization_expect = distfn.expect(lambda x: 1, args=args) + npt.assert_allclose(normalization_expect, 1.0, atol=atol, rtol=rtol, + err_msg=distname, verbose=True) + + normalization_cdf = distfn.cdf(distfn.b, *args) + npt.assert_allclose(normalization_cdf, 1.0) + + +def check_moment(distfn, arg, m, v, msg): + m1 = distfn.moment(1, *arg) + m2 = distfn.moment(2, *arg) + if not np.isinf(m): + npt.assert_almost_equal(m1, m, decimal=10, err_msg=msg + + ' - 1st moment') + else: # or np.isnan(m1), + npt.assert_(np.isinf(m1), + msg + ' - 1st moment -infinite, m1=%s' % str(m1)) + + if not np.isinf(v): + npt.assert_almost_equal(m2 - m1 * m1, v, decimal=10, err_msg=msg + + ' - 2ndt moment') + else: # or np.isnan(m2), + npt.assert_(np.isinf(m2), + msg + ' - 2nd moment -infinite, m2=%s' % str(m2)) + + +def check_mean_expect(distfn, arg, m, msg): + if np.isfinite(m): + m1 = distfn.expect(lambda x: x, arg) + npt.assert_almost_equal(m1, m, decimal=5, err_msg=msg + + ' - 1st moment (expect)') + + +def check_var_expect(distfn, arg, m, v, msg): + if np.isfinite(v): + m2 = distfn.expect(lambda x: x*x, arg) + npt.assert_almost_equal(m2, v + m*m, decimal=5, err_msg=msg + + ' - 2st moment (expect)') + + +def check_skew_expect(distfn, arg, m, v, s, msg): + if np.isfinite(s): + m3e = distfn.expect(lambda x: np.power(x-m, 3), arg) + npt.assert_almost_equal(m3e, s * np.power(v, 1.5), + decimal=5, err_msg=msg + ' - skew') + else: + npt.assert_(np.isnan(s)) + + +def check_kurt_expect(distfn, arg, m, v, k, msg): + if np.isfinite(k): + m4e = distfn.expect(lambda x: np.power(x-m, 4), arg) + npt.assert_allclose(m4e, (k + 3.) * np.power(v, 2), atol=1e-5, rtol=1e-5, + err_msg=msg + ' - kurtosis') + else: + npt.assert_(np.isnan(k)) + + +def check_entropy(distfn, arg, msg): + ent = distfn.entropy(*arg) + npt.assert_(not np.isnan(ent), msg + 'test Entropy is nan') + + +def check_private_entropy(distfn, args, superclass): + # compare a generic _entropy with the distribution-specific implementation + npt.assert_allclose(distfn._entropy(*args), + superclass._entropy(distfn, *args)) + + +def check_edge_support(distfn, args): + # Make sure the x=self.a and self.b are handled correctly. 
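+ # At the support endpoints the cdf must be exactly 0 and 1 (sf 1 and 0); ppf/isf of 0 and 1 must map back to the endpoints, and arguments outside [0, 1] must return nan.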
+ x = [distfn.a, distfn.b] + if isinstance(distfn, stats.rv_continuous): + npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0]) + npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0]) + + npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0]) + npt.assert_equal(distfn.logsf(x, *args), [0.0, -np.inf]) + + if isinstance(distfn, stats.rv_discrete): + x = [distfn.a - 1, distfn.b] + npt.assert_equal(distfn.ppf([0.0, 1.0], *args), x) + npt.assert_equal(distfn.isf([0.0, 1.0], *args), x[::-1]) + + # out-of-bounds for isf & ppf + npt.assert_(np.isnan(distfn.isf([-1, 2], *args)).all()) + npt.assert_(np.isnan(distfn.ppf([-1, 2], *args)).all()) + + +def check_named_args(distfn, x, shape_args, defaults, meths): + ## Check calling w/ named arguments. + + # check consistency of shapes, numargs and _parse signature + signature = inspect.getargspec(distfn._parse_args) + npt.assert_(signature.varargs is None) + npt.assert_(signature.keywords is None) + npt.assert_(signature.defaults == defaults) + + shape_argnames = signature.args[1:-len(defaults)] # self, a, b, loc=0, scale=1 + if distfn.shapes: + shapes_ = distfn.shapes.replace(',', ' ').split() + else: + shapes_ = '' + npt.assert_(len(shapes_) == distfn.numargs) + npt.assert_(len(shapes_) == len(shape_argnames)) + + # check calling w/ named arguments + shape_args = list(shape_args) + + vals = [meth(x, *shape_args) for meth in meths] + npt.assert_(np.all(np.isfinite(vals))) + + names, a, k = shape_argnames[:], shape_args[:], {} + while names: + k.update({names.pop(): a.pop()}) + v = [meth(x, *a, **k) for meth in meths] + npt.assert_array_equal(vals, v) + if not 'n' in k.keys(): + # `n` is first parameter of moment(), so can't be used as named arg + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + npt.assert_equal(distfn.moment(1, *a, **k), + distfn.moment(1, *shape_args)) + + # unknown arguments should not go through: + k.update({'kaboom': 42}) + npt.assert_raises(TypeError, distfn.cdf, x, **k) diff --git a/pywafo/src/wafo/stats/tests/test_binned_statistic.py b/pywafo/src/wafo/stats/tests/test_binned_statistic.py new file mode 100644 index 0000000..26cc4be --- /dev/null +++ b/pywafo/src/wafo/stats/tests/test_binned_statistic.py @@ -0,0 +1,238 @@ +from __future__ import division, print_function, absolute_import + +import numpy as np +from numpy.testing import assert_array_almost_equal, run_module_suite +from scipy.stats import \ + binned_statistic, binned_statistic_2d, binned_statistic_dd + + +class TestBinnedStatistic(object): + + @classmethod + def setup_class(cls): + np.random.seed(9865) + cls.x = np.random.random(100) + cls.y = np.random.random(100) + cls.v = np.random.random(100) + cls.X = np.random.random((100, 3)) + + def test_1d_count(self): + x = self.x + v = self.v + + count1, edges1, bc = binned_statistic(x, v, 'count', bins=10) + count2, edges2 = np.histogram(x, bins=10) + + assert_array_almost_equal(count1, count2) + assert_array_almost_equal(edges1, edges2) + + def test_1d_sum(self): + x = self.x + v = self.v + + sum1, edges1, bc = binned_statistic(x, v, 'sum', bins=10) + sum2, edges2 = np.histogram(x, bins=10, weights=v) + + assert_array_almost_equal(sum1, sum2) + assert_array_almost_equal(edges1, edges2) + + def test_1d_mean(self): + x = self.x + v = self.v + + stat1, edges1, bc = binned_statistic(x, v, 'mean', bins=10) + stat2, edges2, bc = binned_statistic(x, v, np.mean, bins=10) + + assert_array_almost_equal(stat1, stat2) + assert_array_almost_equal(edges1, edges2) + + def test_1d_std(self): + x = self.x + v 
= self.v + + stat1, edges1, bc = binned_statistic(x, v, 'std', bins=10) + stat2, edges2, bc = binned_statistic(x, v, np.std, bins=10) + + assert_array_almost_equal(stat1, stat2) + assert_array_almost_equal(edges1, edges2) + + def test_1d_median(self): + x = self.x + v = self.v + + stat1, edges1, bc = binned_statistic(x, v, 'median', bins=10) + stat2, edges2, bc = binned_statistic(x, v, np.median, bins=10) + + assert_array_almost_equal(stat1, stat2) + assert_array_almost_equal(edges1, edges2) + + def test_1d_bincode(self): + x = self.x[:20] + v = self.v[:20] + + count1, edges1, bc = binned_statistic(x, v, 'count', bins=3) + bc2 = np.array([3, 2, 1, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 1, 2, 3, 1, + 1, 2, 1]) + + bcount = [(bc == i).sum() for i in np.unique(bc)] + + assert_array_almost_equal(bc, bc2) + assert_array_almost_equal(bcount, count1) + + def test_1d_range_keyword(self): + # Regression test for gh-3063, range can be (min, max) or [(min, max)] + np.random.seed(9865) + x = np.arange(30) + data = np.random.random(30) + + mean, bins, _ = binned_statistic(x[:15], data[:15]) + mean_range, bins_range, _ = binned_statistic(x, data, range=[(0, 14)]) + mean_range2, bins_range2, _ = binned_statistic(x, data, range=(0, 14)) + + assert_array_almost_equal(mean, mean_range) + assert_array_almost_equal(bins, bins_range) + assert_array_almost_equal(mean, mean_range2) + assert_array_almost_equal(bins, bins_range2) + + def test_2d_count(self): + x = self.x + y = self.y + v = self.v + + count1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'count', bins=5) + count2, binx2, biny2 = np.histogram2d(x, y, bins=5) + + assert_array_almost_equal(count1, count2) + assert_array_almost_equal(binx1, binx2) + assert_array_almost_equal(biny1, biny2) + + def test_2d_sum(self): + x = self.x + y = self.y + v = self.v + + sum1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'sum', bins=5) + sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v) + + assert_array_almost_equal(sum1, sum2) + assert_array_almost_equal(binx1, binx2) + assert_array_almost_equal(biny1, biny2) + + def test_2d_mean(self): + x = self.x + y = self.y + v = self.v + + stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'mean', bins=5) + stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5) + + assert_array_almost_equal(stat1, stat2) + assert_array_almost_equal(binx1, binx2) + assert_array_almost_equal(biny1, biny2) + + def test_2d_std(self): + x = self.x + y = self.y + v = self.v + + stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'std', bins=5) + stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.std, bins=5) + + assert_array_almost_equal(stat1, stat2) + assert_array_almost_equal(binx1, binx2) + assert_array_almost_equal(biny1, biny2) + + def test_2d_median(self): + x = self.x + y = self.y + v = self.v + + stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'median', bins=5) + stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.median, bins=5) + + assert_array_almost_equal(stat1, stat2) + assert_array_almost_equal(binx1, binx2) + assert_array_almost_equal(biny1, biny2) + + def test_2d_bincode(self): + x = self.x[:20] + y = self.y[:20] + v = self.v[:20] + + count1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'count', bins=3) + bc2 = np.array([17, 11, 6, 16, 11, 17, 18, 17, 17, 7, 6, 18, 16, + 6, 11, 16, 6, 6, 11, 8]) + + bcount = [(bc == i).sum() for i in np.unique(bc)] + + assert_array_almost_equal(bc, bc2) + count1adj = count1[count1.nonzero()] + assert_array_almost_equal(bcount, count1adj) + + def 
test_dd_count(self): + X = self.X + v = self.v + + count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3) + count2, edges2 = np.histogramdd(X, bins=3) + + assert_array_almost_equal(count1, count2) + assert_array_almost_equal(edges1, edges2) + + def test_dd_sum(self): + X = self.X + v = self.v + + sum1, edges1, bc = binned_statistic_dd(X, v, 'sum', bins=3) + sum2, edges2 = np.histogramdd(X, bins=3, weights=v) + + assert_array_almost_equal(sum1, sum2) + assert_array_almost_equal(edges1, edges2) + + def test_dd_mean(self): + X = self.X + v = self.v + + stat1, edges1, bc = binned_statistic_dd(X, v, 'mean', bins=3) + stat2, edges2, bc = binned_statistic_dd(X, v, np.mean, bins=3) + + assert_array_almost_equal(stat1, stat2) + assert_array_almost_equal(edges1, edges2) + + def test_dd_std(self): + X = self.X + v = self.v + + stat1, edges1, bc = binned_statistic_dd(X, v, 'std', bins=3) + stat2, edges2, bc = binned_statistic_dd(X, v, np.std, bins=3) + + assert_array_almost_equal(stat1, stat2) + assert_array_almost_equal(edges1, edges2) + + def test_dd_median(self): + X = self.X + v = self.v + + stat1, edges1, bc = binned_statistic_dd(X, v, 'median', bins=3) + stat2, edges2, bc = binned_statistic_dd(X, v, np.median, bins=3) + + assert_array_almost_equal(stat1, stat2) + assert_array_almost_equal(edges1, edges2) + + def test_dd_bincode(self): + X = self.X[:20] + v = self.v[:20] + + count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3) + bc2 = np.array([63, 33, 86, 83, 88, 67, 57, 33, 42, 41, 82, 83, 92, + 32, 36, 91, 43, 87, 81, 81]) + + bcount = [(bc == i).sum() for i in np.unique(bc)] + + assert_array_almost_equal(bc, bc2) + count1adj = count1[count1.nonzero()] + assert_array_almost_equal(bcount, count1adj) + + +if __name__ == "__main__": + run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_contingency.py b/pywafo/src/wafo/stats/tests/test_contingency.py new file mode 100644 index 0000000..23eee17 --- /dev/null +++ b/pywafo/src/wafo/stats/tests/test_contingency.py @@ -0,0 +1,202 @@ +from __future__ import division, print_function, absolute_import + +import numpy as np +from numpy.testing import (run_module_suite, assert_equal, assert_array_equal, + assert_array_almost_equal, assert_approx_equal, assert_raises, + assert_allclose) +from scipy.special import xlogy +from scipy.stats.contingency import margins, expected_freq, chi2_contingency + + +def test_margins(): + a = np.array([1]) + m = margins(a) + assert_equal(len(m), 1) + m0 = m[0] + assert_array_equal(m0, np.array([1])) + + a = np.array([[1]]) + m0, m1 = margins(a) + expected0 = np.array([[1]]) + expected1 = np.array([[1]]) + assert_array_equal(m0, expected0) + assert_array_equal(m1, expected1) + + a = np.arange(12).reshape(2, 6) + m0, m1 = margins(a) + expected0 = np.array([[15], [51]]) + expected1 = np.array([[6, 8, 10, 12, 14, 16]]) + assert_array_equal(m0, expected0) + assert_array_equal(m1, expected1) + + a = np.arange(24).reshape(2, 3, 4) + m0, m1, m2 = margins(a) + expected0 = np.array([[[66]], [[210]]]) + expected1 = np.array([[[60], [92], [124]]]) + expected2 = np.array([[[60, 66, 72, 78]]]) + assert_array_equal(m0, expected0) + assert_array_equal(m1, expected1) + assert_array_equal(m2, expected2) + + +def test_expected_freq(): + assert_array_equal(expected_freq([1]), np.array([1.0])) + + observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]]) + e = expected_freq(observed) + assert_array_equal(e, np.ones_like(observed)) + + observed = np.array([[10, 10, 20], [20, 20, 20]]) + e = 
expected_freq(observed) + correct = np.array([[12., 12., 16.], [18., 18., 24.]]) + assert_array_almost_equal(e, correct) + + +def test_chi2_contingency_trivial(): + # Some very simple tests for chi2_contingency. + + # A trivial case + obs = np.array([[1, 2], [1, 2]]) + chi2, p, dof, expected = chi2_contingency(obs, correction=False) + assert_equal(chi2, 0.0) + assert_equal(p, 1.0) + assert_equal(dof, 1) + assert_array_equal(obs, expected) + + # A *really* trivial case: 1-D data. + obs = np.array([1, 2, 3]) + chi2, p, dof, expected = chi2_contingency(obs, correction=False) + assert_equal(chi2, 0.0) + assert_equal(p, 1.0) + assert_equal(dof, 0) + assert_array_equal(obs, expected) + + +def test_chi2_contingency_R(): + # Some test cases that were computed independently, using R. + + Rcode = \ + """ + # Data vector. + data <- c( + 12, 34, 23, 4, 47, 11, + 35, 31, 11, 34, 10, 18, + 12, 32, 9, 18, 13, 19, + 12, 12, 14, 9, 33, 25 + ) + + # Create factor tags:r=rows, c=columns, t=tiers + r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4"))) + c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3"))) + t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2"))) + + # 3-way Chi squared test of independence + s = summary(xtabs(data~r+c+t)) + print(s) + """ + Routput = \ + """ + Call: xtabs(formula = data ~ r + c + t) + Number of cases in table: 478 + Number of factors: 3 + Test for independence of all factors: + Chisq = 102.17, df = 17, p-value = 3.514e-14 + """ + obs = np.array( + [[[12, 34, 23], + [35, 31, 11], + [12, 32, 9], + [12, 12, 14]], + [[4, 47, 11], + [34, 10, 18], + [18, 13, 19], + [9, 33, 25]]]) + chi2, p, dof, expected = chi2_contingency(obs) + assert_approx_equal(chi2, 102.17, significant=5) + assert_approx_equal(p, 3.514e-14, significant=4) + assert_equal(dof, 17) + + Rcode = \ + """ + # Data vector. + data <- c( + # + 12, 17, + 11, 16, + # + 11, 12, + 15, 16, + # + 23, 15, + 30, 22, + # + 14, 17, + 15, 16 + ) + + # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers + r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2"))) + c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2"))) + d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2"))) + t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2"))) + + # 4-way Chi squared test of independence + s = summary(xtabs(data~r+c+d+t)) + print(s) + """ + Routput = \ + """ + Call: xtabs(formula = data ~ r + c + d + t) + Number of cases in table: 262 + Number of factors: 4 + Test for independence of all factors: + Chisq = 8.758, df = 11, p-value = 0.6442 + """ + obs = np.array( + [[[[12, 17], + [11, 16]], + [[11, 12], + [15, 16]]], + [[[23, 15], + [30, 22]], + [[14, 17], + [15, 16]]]]) + chi2, p, dof, expected = chi2_contingency(obs) + assert_approx_equal(chi2, 8.758, significant=4) + assert_approx_equal(p, 0.6442, significant=4) + assert_equal(dof, 11) + + +def test_chi2_contingency_g(): + c = np.array([[15, 60], [15, 90]]) + g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=False) + assert_allclose(g, 2*xlogy(c, c/e).sum()) + + g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=True) + c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]]) + assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum()) + + c = np.array([[10, 12, 10], [12, 10, 10]]) + g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood') + assert_allclose(g, 2*xlogy(c, c/e).sum()) + + +def test_chi2_contingency_bad_args(): + # Test that "bad" inputs raise a ValueError. + + # Negative value in the array of observed frequencies. 
+ obs = np.array([[-1, 10], [1, 2]]) + assert_raises(ValueError, chi2_contingency, obs) + + # The zeros in this will result in zeros in the array + # of expected frequencies. + obs = np.array([[0, 1], [0, 1]]) + assert_raises(ValueError, chi2_contingency, obs) + + # A degenerate case: `observed` has size 0. + obs = np.empty((0, 8)) + assert_raises(ValueError, chi2_contingency, obs) + + +if __name__ == "__main__": + run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_continuous_basic.py b/pywafo/src/wafo/stats/tests/test_continuous_basic.py index 7a9f71a..d4fe56c 100644 --- a/pywafo/src/wafo/stats/tests/test_continuous_basic.py +++ b/pywafo/src/wafo/stats/tests/test_continuous_basic.py @@ -2,11 +2,15 @@ from __future__ import division, print_function, absolute_import import warnings -import numpy.testing as npt import numpy as np -import nose +import numpy.testing as npt -from wafo import stats +from scipy import integrate +from scipy import stats +from common_tests import (check_normalization, check_moment, check_mean_expect, + check_var_expect, check_skew_expect, check_kurt_expect, + check_entropy, check_private_entropy, NUMPY_BELOW_1_7, + check_edge_support, check_named_args) """ Test all continuous distributions. @@ -17,39 +21,30 @@ distributions so that we can perform further testing of class methods. These tests currently check only/mostly for serious errors and exceptions, not for numerically exact results. - - -TODO: -* make functioning test for skew and kurtosis - still known failures - skip for now - - """ -#currently not used -DECIMAL = 5 # specify the precision of the tests # increased from 0 to 5 -DECIMAL_kurt = 0 +DECIMAL = 5 # specify the precision of the tests # increased from 0 to 5 distcont = [ ['alpha', (3.5704770516650459,)], ['anglit', ()], ['arcsine', ()], ['beta', (2.3098496451481823, 0.62687954300963677)], - ['betaprime', (5, 6)], # avoid unbound error in entropy with (100, 86)], + ['betaprime', (5, 6)], ['bradford', (0.29891359763170633,)], - ['burr', (10.5, 4.3)], #incorrect mean and var for(0.94839838075366045, 4.3820284068855795)], + ['burr', (10.5, 4.3)], ['cauchy', ()], ['chi', (78,)], ['chi2', (55,)], ['cosine', ()], ['dgamma', (1.1023326088288166,)], ['dweibull', (2.0685080649914673,)], - ['erlang', (20,)], #correction numargs = 1 + ['erlang', (10,)], ['expon', ()], ['exponpow', (2.697119160358469,)], ['exponweib', (2.8923945291034436, 1.9505288745913174)], ['f', (29, 18)], - ['fatiguelife', (29,)], #correction numargs = 1 + ['fatiguelife', (29,)], # correction numargs = 1 ['fisk', (3.0857548622253179,)], ['foldcauchy', (4.7164673455831894,)], ['foldnorm', (1.9521253373555869,)], @@ -57,9 +52,9 @@ distcont = [ ['frechet_r', (1.8928171603534227,)], ['gamma', (1.9932305483800778,)], ['gausshyper', (13.763771604130699, 3.1189636648681431, - 2.5145980350183019, 5.1811649903971615)], #veryslow + 2.5145980350183019, 5.1811649903971615)], # veryslow ['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)], - ['genextreme', (-0.1,)], # sample mean test fails for (3.3184017469423535,)], + ['genextreme', (-0.1,)], ['gengamma', (4.4162385429431925, 3.1193091679242761)], ['genhalflogistic', (0.77274727809929322,)], ['genlogistic', (0.41192440799679475,)], @@ -72,12 +67,12 @@ distcont = [ ['halflogistic', ()], ['halfnorm', ()], ['hypsecant', ()], - ['invgamma', (2.0668996136993067,)], + ['invgamma', (4.0668996136993067,)], ['invgauss', (0.14546264555347513,)], - ['invweibull', (10.58,)], # sample mean test fails at(0.58847112119264788,)] + 
['invweibull', (10.58,)], ['johnsonsb', (4.3172675099141058, 3.1837781130785063)], ['johnsonsu', (2.554395574161155, 2.2482281679651965)], - ['ksone', (1000,)], #replace 22 by 100 to avoid failing range, ticket 956 + ['ksone', (1000,)], # replace 22 by 100 to avoid failing range, ticket 956 ['kstwobign', ()], ['laplace', ()], ['levy', ()], @@ -91,8 +86,7 @@ distcont = [ ['lognorm', (0.95368226960575331,)], ['lomax', (1.8771398388773268,)], ['maxwell', ()], - ['mielke', (10.4, 3.6)], # sample mean test fails for (4.6420495492121487, 0.59707419545516938)], - # mielke: good results if 2nd parameter >2, weird mean or var below + ['mielke', (10.4, 3.6)], ['nakagami', (4.9673794866666237,)], ['ncf', (27, 27, 0.41578441799226107)], ['nct', (14, 0.24045031331198066)], @@ -105,8 +99,6 @@ distcont = [ ['powernorm', (4.4453652254590779,)], ['rayleigh', ()], ['rdist', (0.9,)], # feels also slow -# ['rdist', (3.8266985793976525,)], #veryslow, especially rvs - #['rdist', (541.0,)], # from ticket #758 #veryslow ['recipinvgauss', (0.63004267809369119,)], ['reciprocal', (0.0062309367010521255, 1.0062309367010522)], ['rice', (0.7749725210111873,)], @@ -115,22 +107,36 @@ distcont = [ ['triang', (0.15785029824528218,)], ['truncexpon', (4.6907725456810478,)], ['truncnorm', (-1.0978730080013919, 2.7306754109031979)], + ['truncnorm', (0.1, 2.)], ['tukeylambda', (3.1321477856738267,)], ['uniform', ()], ['vonmises', (3.9939042581071398,)], + ['vonmises_line', (3.9939042581071398,)], ['wald', ()], ['weibull_max', (2.8687961709100187,)], ['weibull_min', (1.7866166930421596,)], ['wrapcauchy', (0.031071279018614728,)]] +## Last four of these fail all around. Need to be checked +distcont_extra = [ + ['betaprime', (100, 86)], + ['fatiguelife', (5,)], + ['mielke', (4.6420495492121487, 0.59707419545516938)], + ['invweibull', (0.58847112119264788,)], + # burr: sample mean test fails still for c<1 + ['burr', (0.94839838075366045, 4.3820284068855795)], + # genextreme: sample mean test, sf-logsf test fail + ['genextreme', (3.3184017469423535,)], +] + + # for testing only specific functions -##distcont = [ -## ['erlang', (20,)], #correction numargs = 1 +# distcont = [ ## ['fatiguelife', (29,)], #correction numargs = 1 ## ['loggamma', (0.41411931826052117,)]] # for testing ticket:767 -##distcont = [ +# distcont = [ ## ['genextreme', (3.3184017469423535,)], ## ['genextreme', (0.01,)], ## ['genextreme', (0.00001,)], @@ -138,12 +144,12 @@ distcont = [ ## ['genextreme', (-0.01,)] ## ] -##distcont = [['gumbel_l', ()], +# distcont = [['gumbel_l', ()], ## ['gumbel_r', ()], ## ['norm', ()] ## ] -##distcont = [['norm', ()]] +# distcont = [['norm', ()]] distmissing = ['wald', 'gausshyper', 'genexpon', 'rv_continuous', 'loglaplace', 'rdist', 'semicircular', 'invweibull', 'ksone', @@ -154,11 +160,14 @@ distmissing = ['wald', 'gausshyper', 'genexpon', 'rv_continuous', distmiss = [[dist,args] for dist,args in distcont if dist in distmissing] distslow = ['rdist', 'gausshyper', 'recipinvgauss', 'ksone', 'genexpon', - 'vonmises', 'rice', 'mielke', 'semicircular', 'cosine', 'invweibull', - 'powerlognorm', 'johnsonsu', 'kstwobign'] -#distslow are sorted by speed (very slow to slow) + 'vonmises', 'vonmises_line', 'mielke', 'semicircular', + 'cosine', 'invweibull', 'powerlognorm', 'johnsonsu', 'kstwobign'] +# distslow are sorted by speed (very slow to slow) + +# NB: not needed anymore? 
def _silence_fp_errors(func): + # warning: don't apply to test_ functions as is, then those will be skipped def wrap(*a, **kw): olderr = np.seterr(all='ignore') try: @@ -168,162 +177,183 @@ def _silence_fp_errors(func): wrap.__name__ = func.__name__ return wrap -@_silence_fp_errors + def test_cont_basic(): # this test skips slow distributions - for distname, arg in distcont[:]: - if distname in distslow: - continue - distfn = getattr(stats, distname) - np.random.seed(765456) - sn = 1000 - rvs = distfn.rvs(size=sn,*arg) - sm = rvs.mean() - sv = rvs.var() - skurt = stats.kurtosis(rvs) - sskew = stats.skew(rvs) - m,v = distfn.stats(*arg) - - yield check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn, distname + \ - 'sample mean test' - # the sample skew kurtosis test has known failures, not very good distance measure - #yield check_sample_skew_kurt, distfn, arg, sskew, skurt, distname - yield check_moment, distfn, arg, m, v, distname - yield check_cdf_ppf, distfn, arg, distname - yield check_sf_isf, distfn, arg, distname - yield check_pdf, distfn, arg, distname - if distname in ['wald']: - continue - yield check_pdf_logpdf, distfn, arg, distname - yield check_cdf_logcdf, distfn, arg, distname - yield check_sf_logsf, distfn, arg, distname - if distname in distmissing: - alpha = 0.01 - yield check_distribution_rvs, distname, arg, alpha, rvs + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=integrate.IntegrationWarning) + for distname, arg in distcont[:]: + if distname in distslow: + continue + distfn = getattr(stats, distname) + np.random.seed(765456) + sn = 500 + rvs = distfn.rvs(size=sn, *arg) + sm = rvs.mean() + sv = rvs.var() + m, v = distfn.stats(*arg) + + yield check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn, \ + distname + 'sample mean test' + yield check_cdf_ppf, distfn, arg, distname + yield check_sf_isf, distfn, arg, distname + yield check_pdf, distfn, arg, distname + yield check_pdf_logpdf, distfn, arg, distname + yield check_cdf_logcdf, distfn, arg, distname + yield check_sf_logsf, distfn, arg, distname + if distname in distmissing: + alpha = 0.01 + yield check_distribution_rvs, distname, arg, alpha, rvs + + locscale_defaults = (0, 1) + meths = [distfn.pdf, distfn.logpdf, distfn.cdf, distfn.logcdf, + distfn.logsf] + # make sure arguments are within support + spec_x = {'frechet_l': -0.5, 'weibull_max': -0.5, 'levy_l': -0.5, + 'pareto': 1.5, 'tukeylambda': 0.3} + x = spec_x.get(distname, 0.5) + yield check_named_args, distfn, x, arg, locscale_defaults, meths + + # Entropy + skp = npt.dec.skipif + yield check_entropy, distfn, arg, distname + + if distfn.numargs == 0: + yield skp(NUMPY_BELOW_1_7)(check_vecentropy), distfn, arg + if distfn.__class__._entropy != stats.rv_continuous._entropy: + yield check_private_entropy, distfn, arg, stats.rv_continuous + + yield check_edge_support, distfn, arg + + knf = npt.dec.knownfailureif + yield knf(distname == 'truncnorm')(check_ppf_private), distfn, \ + arg, distname @npt.dec.slow def test_cont_basic_slow(): # same as above for slow distributions - for distname, arg in distcont[:]: - if distname not in distslow: continue - distfn = getattr(stats, distname) - np.random.seed(765456) - sn = 1000 - rvs = distfn.rvs(size=sn,*arg) - sm = rvs.mean() - sv = rvs.var() - skurt = stats.kurtosis(rvs) - sskew = stats.skew(rvs) - m,v = distfn.stats(*arg) - yield check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn, distname + \ - 'sample mean test' - # the sample skew kurtosis test has known failures, not very good distance 
measure - #yield check_sample_skew_kurt, distfn, arg, sskew, skurt, distname - yield check_moment, distfn, arg, m, v, distname - yield check_cdf_ppf, distfn, arg, distname - yield check_sf_isf, distfn, arg, distname - yield check_pdf, distfn, arg, distname - yield check_pdf_logpdf, distfn, arg, distname - yield check_cdf_logcdf, distfn, arg, distname - yield check_sf_logsf, distfn, arg, distname - #yield check_oth, distfn, arg # is still missing - if distname in distmissing: - alpha = 0.01 - yield check_distribution_rvs, distname, arg, alpha, rvs - -@_silence_fp_errors -def check_moment(distfn, arg, m, v, msg): - m1 = distfn.moment(1,*arg) - m2 = distfn.moment(2,*arg) - if not np.isinf(m): - npt.assert_almost_equal(m1, m, decimal=10, err_msg= msg + \ - ' - 1st moment') - else: # or np.isnan(m1), - npt.assert_(np.isinf(m1), - msg + ' - 1st moment -infinite, m1=%s' % str(m1)) - #np.isnan(m1) temporary special treatment for loggamma - if not np.isinf(v): - npt.assert_almost_equal(m2-m1*m1, v, decimal=10, err_msg= msg + \ - ' - 2ndt moment') - else: #or np.isnan(m2), - npt.assert_(np.isinf(m2), - msg + ' - 2nd moment -infinite, m2=%s' % str(m2)) - #np.isnan(m2) temporary special treatment for loggamma - -@_silence_fp_errors + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=integrate.IntegrationWarning) + for distname, arg in distcont[:]: + if distname not in distslow: + continue + distfn = getattr(stats, distname) + np.random.seed(765456) + sn = 500 + rvs = distfn.rvs(size=sn,*arg) + sm = rvs.mean() + sv = rvs.var() + m, v = distfn.stats(*arg) + yield check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn, \ + distname + 'sample mean test' + yield check_cdf_ppf, distfn, arg, distname + yield check_sf_isf, distfn, arg, distname + yield check_pdf, distfn, arg, distname + yield check_pdf_logpdf, distfn, arg, distname + yield check_cdf_logcdf, distfn, arg, distname + yield check_sf_logsf, distfn, arg, distname + # yield check_oth, distfn, arg # is still missing + if distname in distmissing: + alpha = 0.01 + yield check_distribution_rvs, distname, arg, alpha, rvs + + locscale_defaults = (0, 1) + meths = [distfn.pdf, distfn.logpdf, distfn.cdf, distfn.logcdf, + distfn.logsf] + # make sure arguments are within support + x = 0.5 + if distname == 'invweibull': + arg = (1,) + elif distname == 'ksone': + arg = (3,) + yield check_named_args, distfn, x, arg, locscale_defaults, meths + + # Entropy + skp = npt.dec.skipif + ks_cond = distname in ['ksone', 'kstwobign'] + yield skp(ks_cond)(check_entropy), distfn, arg, distname + + if distfn.numargs == 0: + yield skp(NUMPY_BELOW_1_7)(check_vecentropy), distfn, arg + if distfn.__class__._entropy != stats.rv_continuous._entropy: + yield check_private_entropy, distfn, arg, stats.rv_continuous + + yield check_edge_support, distfn, arg + + +@npt.dec.slow +def test_moments(): + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=integrate.IntegrationWarning) + knf = npt.dec.knownfailureif + fail_normalization = set(['vonmises', 'ksone']) + fail_higher = set(['vonmises', 'ksone', 'ncf']) + for distname, arg in distcont[:]: + distfn = getattr(stats, distname) + m, v, s, k = distfn.stats(*arg, moments='mvsk') + cond1, cond2 = distname in fail_normalization, distname in fail_higher + msg = distname + ' fails moments' + yield knf(cond1, msg)(check_normalization), distfn, arg, distname + yield knf(cond2, msg)(check_mean_expect), distfn, arg, m, distname + yield knf(cond2, msg)(check_var_expect), distfn, arg, m, v, distname 
+ yield knf(cond2, msg)(check_skew_expect), distfn, arg, m, v, s, \ + distname + yield knf(cond2, msg)(check_kurt_expect), distfn, arg, m, v, k, \ + distname + yield check_loc_scale, distfn, arg, m, v, distname + yield check_moment, distfn, arg, m, v, distname + + def check_sample_meanvar_(distfn, arg, m, v, sm, sv, sn, msg): - #this did not work, skipped silently by nose - #check_sample_meanvar, sm, m, msg + 'sample mean test' - #check_sample_meanvar, sv, v, msg + 'sample var test' + # this did not work, skipped silently by nose if not np.isinf(m): check_sample_mean(sm, sv, sn, m) if not np.isinf(v): check_sample_var(sv, sn, v) -## check_sample_meanvar( sm, m, msg + 'sample mean test') -## check_sample_meanvar( sv, v, msg + 'sample var test') -def check_sample_mean(sm,v,n, popmean): - """ -from stats.stats.ttest_1samp(a, popmean): -Calculates the t-obtained for the independent samples T-test on ONE group -of scores a, given a population mean. -Returns: t-value, two-tailed prob -""" -## a = asarray(a) -## x = np.mean(a) -## v = np.var(a, ddof=1) -## n = len(a) +def check_sample_mean(sm,v,n, popmean): + # from stats.stats.ttest_1samp(a, popmean): + # Calculates the t-obtained for the independent samples T-test on ONE group + # of scores a, given a population mean. + # + # Returns: t-value, two-tailed prob df = n-1 - svar = ((n-1)*v) / float(df) #looks redundant - t = (sm-popmean)/np.sqrt(svar*(1.0/n)) - prob = stats.betai(0.5*df,0.5,df/(df+t*t)) + svar = ((n-1)*v) / float(df) # looks redundant + t = (sm-popmean) / np.sqrt(svar*(1.0/n)) + prob = stats.betai(0.5*df, 0.5, df/(df+t*t)) + + # return t,prob + npt.assert_(prob > 0.01, 'mean fail, t,prob = %f, %f, m, sm=%f,%f' % + (t, prob, popmean, sm)) - #return t,prob - npt.assert_(prob > 0.01, 'mean fail, t,prob = %f, %f, m,sm=%f,%f' % (t,prob,popmean,sm)) def check_sample_var(sv,n, popvar): - ''' -two-sided chisquare test for sample variance equal to hypothesized variance - ''' + # two-sided chisquare test for sample variance equal to hypothesized variance df = n-1 chi2 = (n-1)*popvar/float(popvar) pval = stats.chisqprob(chi2,df)*2 - npt.assert_(pval > 0.01, 'var fail, t,pval = %f, %f, v,sv=%f,%f' % (chi2,pval,popvar,sv)) + npt.assert_(pval > 0.01, 'var fail, t, pval = %f, %f, v, sv=%f, %f' % + (chi2,pval,popvar,sv)) -def check_sample_skew_kurt(distfn, arg, ss, sk, msg): - skew,kurt = distfn.stats(moments='sk',*arg) -## skew = distfn.stats(moment='s',*arg)[()] -## kurt = distfn.stats(moment='k',*arg)[()] - check_sample_meanvar( sk, kurt, msg + 'sample kurtosis test') - check_sample_meanvar( ss, skew, msg + 'sample skew test') - -def check_sample_meanvar(sm,m,msg): - if not np.isinf(m) and not np.isnan(m): - npt.assert_almost_equal(sm, m, decimal=DECIMAL, err_msg= msg + \ - ' - finite moment') -## else: -## npt.assert_(abs(sm) > 10000), msg='infinite moment, sm = ' + str(sm)) - -@_silence_fp_errors def check_cdf_ppf(distfn,arg,msg): values = [0.001, 0.5, 0.999] npt.assert_almost_equal(distfn.cdf(distfn.ppf(values, *arg), *arg), - values, decimal=DECIMAL, err_msg= msg + \ + values, decimal=DECIMAL, err_msg=msg + ' - cdf-ppf roundtrip') -@_silence_fp_errors + def check_sf_isf(distfn,arg,msg): npt.assert_almost_equal(distfn.sf(distfn.isf([0.1,0.5,0.9], *arg), *arg), - [0.1,0.5,0.9], decimal=DECIMAL, err_msg= msg + \ + [0.1,0.5,0.9], decimal=DECIMAL, err_msg=msg + ' - sf-isf roundtrip') npt.assert_almost_equal(distfn.cdf([0.1,0.9], *arg), 1.0-distfn.sf([0.1,0.9], *arg), - decimal=DECIMAL, err_msg= msg + \ + decimal=DECIMAL, err_msg=msg + ' - cdf-sf 
relationship') -@_silence_fp_errors + def check_pdf(distfn, arg, msg): # compares pdf at median with numerical derivative of cdf median = distfn.ppf(0.5, *arg) @@ -335,12 +365,12 @@ def check_pdf(distfn, arg, msg): pdfv = distfn.pdf(median, *arg) cdfdiff = (distfn.cdf(median + eps, *arg) - distfn.cdf(median - eps, *arg))/eps/2.0 - #replace with better diff and better test (more points), - #actually, this works pretty well + # replace with better diff and better test (more points), + # actually, this works pretty well npt.assert_almost_equal(pdfv, cdfdiff, - decimal=DECIMAL, err_msg= msg + ' - cdf-pdf relationship') + decimal=DECIMAL, err_msg=msg + ' - cdf-pdf relationship') + -@_silence_fp_errors def check_pdf_logpdf(distfn, args, msg): # compares pdf at several points with the log of the pdf points = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) @@ -351,7 +381,7 @@ def check_pdf_logpdf(distfn, args, msg): logpdf = logpdf[np.isfinite(logpdf)] npt.assert_almost_equal(np.log(pdf), logpdf, decimal=7, err_msg=msg + " - logpdf-log(pdf) relationship") -@_silence_fp_errors + def check_sf_logsf(distfn, args, msg): # compares sf at several points with the log of the sf points = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) @@ -362,7 +392,7 @@ def check_sf_logsf(distfn, args, msg): logsf = logsf[np.isfinite(logsf)] npt.assert_almost_equal(np.log(sf), logsf, decimal=7, err_msg=msg + " - logsf-log(sf) relationship") -@_silence_fp_errors + def check_cdf_logcdf(distfn, args, msg): # compares cdf at several points with the log of the cdf points = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) @@ -374,10 +404,9 @@ def check_cdf_logcdf(distfn, args, msg): npt.assert_almost_equal(np.log(cdf), logcdf, decimal=7, err_msg=msg + " - logcdf-log(cdf) relationship") -@_silence_fp_errors def check_distribution_rvs(dist, args, alpha, rvs): - #test from scipy.stats.tests - #this version reuses existing random variables + # test from scipy.stats.tests + # this version reuses existing random variables D,pval = stats.kstest(rvs, dist, args=args, N=1000) if (pval < alpha): D,pval = stats.kstest(dist,'',args=args, N=1000) @@ -385,6 +414,22 @@ def check_distribution_rvs(dist, args, alpha, rvs): "; alpha = " + str(alpha) + "\nargs = " + str(args)) +def check_vecentropy(distfn, args): + npt.assert_equal(distfn.vecentropy(*args), distfn._entropy(*args)) + +@npt.dec.skipif(NUMPY_BELOW_1_7) +def check_loc_scale(distfn, arg, m, v, msg): + loc, scale = 10.0, 10.0 + mt, vt = distfn.stats(loc=loc, scale=scale, *arg) + npt.assert_allclose(m*scale + loc, mt) + npt.assert_allclose(v*scale*scale, vt) + + +def check_ppf_private(distfn, arg, msg): + #fails by design for truncnorm self.nb not defined + ppfs = distfn._ppf(np.array([0.1, 0.5, 0.9]), *arg) + npt.assert_(not np.any(np.isnan(ppfs)), msg + 'ppf private is nan') + + if __name__ == "__main__": - #nose.run(argv=['', __file__]) - nose.runmodule(argv=[__file__,'-s'], exit=False) + npt.run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_discrete_basic.py b/pywafo/src/wafo/stats/tests/test_discrete_basic.py index 659f809..d01d9c1 100644 --- a/pywafo/src/wafo/stats/tests/test_discrete_basic.py +++ b/pywafo/src/wafo/stats/tests/test_discrete_basic.py @@ -1,268 +1,202 @@ -import numpy.testing as npt -import numpy as np -import nose - -from wafo import stats - -DECIMAL_meanvar = 0#1 # was 0 - -distdiscrete = [ - ['bernoulli',(0.3,)], - ['binom', (5, 0.4)], - ['boltzmann',(1.4, 19)], - ['dlaplace', (0.8,)], #0.5 - ['geom', (0.5,)], - ['hypergeom',(30, 12, 6)], - 
['hypergeom',(21,3,12)], #numpy.random (3,18,12) numpy ticket:921 - ['hypergeom',(21,18,11)], #numpy.random (18,3,11) numpy ticket:921 - ['logser', (0.6,)], # reenabled, numpy ticket:921 - ['nbinom', (5, 0.5)], - ['nbinom', (0.4, 0.4)], #from tickets: 583 - ['planck', (0.51,)], #4.1 - ['poisson', (0.6,)], - ['randint', (7, 31)], - ['skellam', (15, 8)]] -# ['zipf', (4,)] ] # arg=4 is ok, - # Zipf broken for arg = 2, e.g. weird .stats - # looking closer, mean, var should be inf for arg=2 - - -#@npt.dec.slow -def test_discrete_basic(): - for distname, arg in distdiscrete: - distfn = getattr(stats,distname) - #assert stats.dlaplace.rvs(0.8) is not None - np.random.seed(9765456) - rvs = distfn.rvs(size=2000,*arg) - supp = np.unique(rvs) - m,v = distfn.stats(*arg) - #yield npt.assert_almost_equal(rvs.mean(), m, decimal=4,err_msg='mean') - #yield npt.assert_almost_equal, rvs.mean(), m, 2, 'mean' # does not work - yield check_sample_meanvar, rvs.mean(), m, distname + ' sample mean test' - yield check_sample_meanvar, rvs.var(), v, distname + ' sample var test' - yield check_cdf_ppf, distfn, arg, distname + ' cdf_ppf' - yield check_cdf_ppf2, distfn, arg, supp, distname + ' cdf_ppf' - yield check_pmf_cdf, distfn, arg, distname + ' pmf_cdf' - - # zipf doesn't fail, but generates floating point warnings. - # Should be checked. - if not distname in ['zipf']: - yield check_oth, distfn, arg, distname + ' oth' - skurt = stats.kurtosis(rvs) - sskew = stats.skew(rvs) - yield check_sample_skew_kurt, distfn, arg, skurt, sskew, \ - distname + ' skew_kurt' - - # dlaplace doesn't fail, but generates lots of floating point warnings. - # Should be checked. - if not distname in ['dlaplace']: #['logser']: #known failure, fixed - alpha = 0.01 - yield check_discrete_chisquare, distfn, arg, rvs, alpha, \ - distname + ' chisquare' - -@npt.dec.slow -def test_discrete_extra(): - for distname, arg in distdiscrete: - distfn = getattr(stats,distname) - yield check_ppf_limits, distfn, arg, distname + \ - ' ppf limit test' - yield check_isf_limits, distfn, arg, distname + \ - ' isf limit test' - yield check_entropy, distfn, arg, distname + \ - ' entropy nan test' - -@npt.dec.skipif(True) -def test_discrete_private(): - #testing private methods mostly for debugging - # some tests might fail by design, - # e.g. 
incorrect definition of distfn.a and distfn.b - for distname, arg in distdiscrete: - distfn = getattr(stats,distname) - rvs = distfn.rvs(size=10000,*arg) - m,v = distfn.stats(*arg) - - yield check_ppf_ppf, distfn, arg - yield check_cdf_ppf_private, distfn, arg, distname - yield check_generic_moment, distfn, arg, m, 1, 3 # last is decimal - yield check_generic_moment, distfn, arg, v+m*m, 2, 3 # last is decimal - yield check_moment_frozen, distfn, arg, m, 1, 3 # last is decimal - yield check_moment_frozen, distfn, arg, v+m*m, 2, 3 # last is decimal - - -def check_sample_meanvar(sm,m,msg): - if not np.isinf(m): - npt.assert_almost_equal(sm, m, decimal=DECIMAL_meanvar, err_msg=msg + \ - ' - finite moment') - else: - npt.assert_(sm > 10000, msg='infinite moment, sm = ' + str(sm)) - -def check_sample_var(sm,m,msg): - npt.assert_almost_equal(sm, m, decimal=DECIMAL_meanvar, err_msg= msg + 'var') - -def check_cdf_ppf(distfn,arg,msg): - ppf05 = distfn.ppf(0.5,*arg) - cdf05 = distfn.cdf(ppf05,*arg) - npt.assert_almost_equal(distfn.ppf(cdf05-1e-6,*arg),ppf05, - err_msg=msg + 'ppf-cdf-median') - npt.assert_((distfn.ppf(cdf05+1e-4,*arg)>ppf05), msg + 'ppf-cdf-next') - -def check_cdf_ppf2(distfn,arg,supp,msg): - npt.assert_array_equal(distfn.ppf(distfn.cdf(supp,*arg),*arg), - supp, msg + '-roundtrip') - npt.assert_array_equal(distfn.ppf(distfn.cdf(supp,*arg)-1e-8,*arg), - supp, msg + '-roundtrip') - # -1e-8 could cause an error if pmf < 1e-8 - - -def check_cdf_ppf_private(distfn,arg,msg): - ppf05 = distfn._ppf(0.5,*arg) - cdf05 = distfn.cdf(ppf05,*arg) - npt.assert_almost_equal(distfn._ppf(cdf05-1e-6,*arg),ppf05, - err_msg=msg + '_ppf-cdf-median ') - npt.assert_((distfn._ppf(cdf05+1e-4,*arg)>ppf05), msg + '_ppf-cdf-next') - -def check_ppf_ppf(distfn, arg): - npt.assert_(distfn.ppf(0.5,*arg) < np.inf) - ppfs = distfn.ppf([0.5,0.9],*arg) - ppf_s = [distfn._ppf(0.5,*arg), distfn._ppf(0.9,*arg)] - npt.assert_(np.all(ppfs < np.inf)) - npt.assert_(ppf_s[0] == distfn.ppf(0.5,*arg)) - npt.assert_(ppf_s[1] == distfn.ppf(0.9,*arg)) - npt.assert_(ppf_s[0] == ppfs[0]) - npt.assert_(ppf_s[1] == ppfs[1]) - -def check_pmf_cdf(distfn, arg, msg): - startind = np.int(distfn._ppf(0.01,*arg)-1) - index = range(startind,startind+10) - cdfs = distfn.cdf(index,*arg) - npt.assert_almost_equal(cdfs, distfn.pmf(index, *arg).cumsum() + \ - cdfs[0] - distfn.pmf(index[0],*arg), - decimal=4, err_msg= msg + 'pmf-cdf') - -def check_generic_moment(distfn, arg, m, k, decim): - npt.assert_almost_equal(distfn.generic_moment(k,*arg), m, decimal=decim, - err_msg= str(distfn) + ' generic moment test') - -def check_moment_frozen(distfn, arg, m, k, decim): - npt.assert_almost_equal(distfn(*arg).moment(k), m, decimal=decim, - err_msg= str(distfn) + ' frozen moment test') - -def check_oth(distfn, arg, msg): - #checking other methods of distfn - meanint = round(float(distfn.stats(*arg)[0])) # closest integer to mean - npt.assert_almost_equal(distfn.sf(meanint, *arg), 1 - \ - distfn.cdf(meanint, *arg), decimal=8) - median_sf = distfn.isf(0.5, *arg) - - npt.assert_(distfn.sf(median_sf - 1, *arg) > 0.5) - npt.assert_(distfn.cdf(median_sf + 1, *arg) > 0.5) - npt.assert_equal(distfn.isf(0.5, *arg), distfn.ppf(0.5, *arg)) - -#next 3 functions copied from test_continous_extra -# adjusted - -def check_ppf_limits(distfn,arg,msg): - below,low,upp,above = distfn.ppf([-1,0,1,2], *arg) - #print distfn.name, distfn.a, low, distfn.b, upp - #print distfn.name,below,low,upp,above - assert_equal_inf_nan(distfn.a-1,low, msg + 'ppf lower bound') - 
assert_equal_inf_nan(distfn.b,upp, msg + 'ppf upper bound') - npt.assert_(np.isnan(below), msg + 'ppf out of bounds - below') - npt.assert_(np.isnan(above), msg + 'ppf out of bounds - above') - -def check_isf_limits(distfn,arg,msg): - below,low,upp,above = distfn.isf([-1,0,1,2], *arg) - #print distfn.name, distfn.a, low, distfn.b, upp - #print distfn.name,below,low,upp,above - assert_equal_inf_nan(distfn.a-1,upp, msg + 'isf lower bound') - assert_equal_inf_nan(distfn.b,low, msg + 'isf upper bound') - npt.assert_(np.isnan(below), msg + 'isf out of bounds - below') - npt.assert_(np.isnan(above), msg + 'isf out of bounds - above') - -def assert_equal_inf_nan(v1,v2,msg): - npt.assert_(not np.isnan(v1)) - if not np.isinf(v1): - npt.assert_almost_equal(v1, v2, decimal=10, err_msg = msg + \ - ' - finite') - else: - npt.assert_(np.isinf(v2) or np.isnan(v2), - msg + ' - infinite, v2=%s' % str(v2)) - -def check_sample_skew_kurt(distfn, arg, sk, ss, msg): - k,s = distfn.stats(moment='ks',*arg) - check_sample_meanvar, sk, k, msg + 'sample skew test' - check_sample_meanvar, ss, s, msg + 'sample kurtosis test' - - -def check_entropy(distfn,arg,msg): - ent = distfn.entropy(*arg) - #print 'Entropy =', ent - npt.assert_(not np.isnan(ent), msg + 'test Entropy is nan') - - -def check_discrete_chisquare(distfn, arg, rvs, alpha, msg): - '''perform chisquare test for random sample of a discrete distribution - - Parameters - ---------- - distname : string - name of distribution function - arg : sequence - parameters of distribution - alpha : float - significance level, threshold for p-value - - Returns - ------- - result : bool - 0 if test passes, 1 if test fails - - uses global variable debug for printing results - ''' - - # define parameters for test -## n=2000 - n = len(rvs) - nsupp = 20 - wsupp = 1.0/nsupp - -## distfn = getattr(stats, distname) -## np.random.seed(9765456) -## rvs = distfn.rvs(size=n,*arg) - - # construct intervals with minimum mass 1/nsupp - # intervalls are left-half-open as in a cdf difference - distsupport = xrange(max(distfn.a, -1000), min(distfn.b, 1000) + 1) - last = 0 - distsupp = [max(distfn.a, -1000)] - distmass = [] - for ii in distsupport: - current = distfn.cdf(ii,*arg) - if current - last >= wsupp-1e-14: - distsupp.append(ii) - distmass.append(current - last) - last = current - if current > (1-wsupp): - break - if distsupp[-1] < distfn.b: - distsupp.append(distfn.b) - distmass.append(1-last) - distsupp = np.array(distsupp) - distmass = np.array(distmass) - - # convert intervals to right-half-open as required by histogram - histsupp = distsupp+1e-8 - histsupp[0] = distfn.a - - # find sample frequencies and perform chisquare test - freq,hsupp = np.histogram(rvs,histsupp) - cdfs = distfn.cdf(distsupp,*arg) - (chis,pval) = stats.chisquare(np.array(freq),n*distmass) - - npt.assert_(pval > alpha, 'chisquare - test for %s' - ' at arg = %s with pval = %s' % (msg,str(arg),str(pval))) - - -if __name__ == "__main__": - #nose.run(argv=['', __file__]) - nose.runmodule(argv=[__file__,'-s'], exit=False) +from __future__ import division, print_function, absolute_import + +import numpy.testing as npt +import numpy as np +try: + from scipy.lib.six import xrange +except: + pass +from scipy import stats +from .common_tests import (check_normalization, check_moment, check_mean_expect, + check_var_expect, check_skew_expect, check_kurt_expect, + check_entropy, check_private_entropy, check_edge_support, + check_named_args) +knf = npt.dec.knownfailureif + +distdiscrete = [ + ['bernoulli',(0.3,)], + 
['binom', (5, 0.4)], + ['boltzmann',(1.4, 19)], + ['dlaplace', (0.8,)], # 0.5 + ['geom', (0.5,)], + ['hypergeom',(30, 12, 6)], + ['hypergeom',(21,3,12)], # numpy.random (3,18,12) numpy ticket:921 + ['hypergeom',(21,18,11)], # numpy.random (18,3,11) numpy ticket:921 + ['logser', (0.6,)], # reenabled, numpy ticket:921 + ['nbinom', (5, 0.5)], + ['nbinom', (0.4, 0.4)], # from tickets: 583 + ['planck', (0.51,)], # 4.1 + ['poisson', (0.6,)], + ['randint', (7, 31)], + ['skellam', (15, 8)], + ['zipf', (6.5,)] +] + + +def test_discrete_basic(): + for distname, arg in distdiscrete: + distfn = getattr(stats, distname) + np.random.seed(9765456) + rvs = distfn.rvs(size=2000, *arg) + supp = np.unique(rvs) + m, v = distfn.stats(*arg) + yield check_cdf_ppf, distfn, arg, supp, distname + ' cdf_ppf' + + yield check_pmf_cdf, distfn, arg, distname + yield check_oth, distfn, arg, supp, distname + ' oth' + yield check_edge_support, distfn, arg + + alpha = 0.01 + yield check_discrete_chisquare, distfn, arg, rvs, alpha, \ + distname + ' chisquare' + + seen = set() + for distname, arg in distdiscrete: + if distname in seen: + continue + seen.add(distname) + distfn = getattr(stats,distname) + locscale_defaults = (0,) + meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf, + distfn.logsf] + # make sure arguments are within support + spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0, } + k = spec_k.get(distname, 1) + yield check_named_args, distfn, k, arg, locscale_defaults, meths + yield check_scale_docstring, distfn + + # Entropy + yield check_entropy, distfn, arg, distname + if distfn.__class__._entropy != stats.rv_discrete._entropy: + yield check_private_entropy, distfn, arg, stats.rv_discrete + + +def test_moments(): + for distname, arg in distdiscrete: + distfn = getattr(stats,distname) + m, v, s, k = distfn.stats(*arg, moments='mvsk') + yield check_normalization, distfn, arg, distname + + # compare `stats` and `moment` methods + yield check_moment, distfn, arg, m, v, distname + yield check_mean_expect, distfn, arg, m, distname + yield check_var_expect, distfn, arg, m, v, distname + yield check_skew_expect, distfn, arg, m, v, s, distname + + cond = distname in ['zipf'] + msg = distname + ' fails kurtosis' + yield knf(cond, msg)(check_kurt_expect), distfn, arg, m, v, k, distname + + # frozen distr moments + yield check_moment_frozen, distfn, arg, m, 1 + yield check_moment_frozen, distfn, arg, v+m*m, 2 + + +def check_cdf_ppf(distfn, arg, supp, msg): + # cdf is a step function, and ppf(q) = min{k : cdf(k) >= q, k integer} + npt.assert_array_equal(distfn.ppf(distfn.cdf(supp, *arg), *arg), + supp, msg + '-roundtrip') + npt.assert_array_equal(distfn.ppf(distfn.cdf(supp, *arg) - 1e-8, *arg), + supp, msg + '-roundtrip') + supp1 = supp[supp < distfn.b] + npt.assert_array_equal(distfn.ppf(distfn.cdf(supp1, *arg) + 1e-8, *arg), + supp1 + distfn.inc, msg + 'ppf-cdf-next') + # -1e-8 could cause an error if pmf < 1e-8 + + +def check_pmf_cdf(distfn, arg, distname): + startind = np.int(distfn.ppf(0.01, *arg) - 1) + index = list(range(startind, startind + 10)) + cdfs, pmfs_cum = distfn.cdf(index,*arg), distfn.pmf(index, *arg).cumsum() + + atol, rtol = 1e-10, 1e-10 + if distname == 'skellam': # ncx2 accuracy + atol, rtol = 1e-5, 1e-5 + npt.assert_allclose(cdfs - cdfs[0], pmfs_cum - pmfs_cum[0], + atol=atol, rtol=rtol) + + +def check_moment_frozen(distfn, arg, m, k): + npt.assert_allclose(distfn(*arg).moment(k), m, + atol=1e-10, rtol=1e-10) + + +def check_oth(distfn, arg, supp, msg): + # checking other methods of 
distfn + npt.assert_allclose(distfn.sf(supp, *arg), 1. - distfn.cdf(supp, *arg), + atol=1e-10, rtol=1e-10) + + q = np.linspace(0.01, 0.99, 20) + npt.assert_allclose(distfn.isf(q, *arg), distfn.ppf(1. - q, *arg), + atol=1e-10, rtol=1e-10) + + median_sf = distfn.isf(0.5, *arg) + npt.assert_(distfn.sf(median_sf - 1, *arg) > 0.5) + npt.assert_(distfn.cdf(median_sf + 1, *arg) > 0.5) + + +def check_discrete_chisquare(distfn, arg, rvs, alpha, msg): + """Perform chisquare test for random sample of a discrete distribution + + Parameters + ---------- + distname : string + name of distribution function + arg : sequence + parameters of distribution + alpha : float + significance level, threshold for p-value + + Returns + ------- + result : bool + 0 if test passes, 1 if test fails + + uses global variable debug for printing results + + """ + n = len(rvs) + nsupp = 20 + wsupp = 1.0/nsupp + + # construct intervals with minimum mass 1/nsupp + # intervals are left-half-open as in a cdf difference + distsupport = xrange(max(distfn.a, -1000), min(distfn.b, 1000) + 1) + last = 0 + distsupp = [max(distfn.a, -1000)] + distmass = [] + for ii in distsupport: + current = distfn.cdf(ii,*arg) + if current - last >= wsupp-1e-14: + distsupp.append(ii) + distmass.append(current - last) + last = current + if current > (1-wsupp): + break + if distsupp[-1] < distfn.b: + distsupp.append(distfn.b) + distmass.append(1-last) + distsupp = np.array(distsupp) + distmass = np.array(distmass) + + # convert intervals to right-half-open as required by histogram + histsupp = distsupp+1e-8 + histsupp[0] = distfn.a + + # find sample frequencies and perform chisquare test + freq,hsupp = np.histogram(rvs,histsupp) + cdfs = distfn.cdf(distsupp,*arg) + (chis,pval) = stats.chisquare(np.array(freq),n*distmass) + + npt.assert_(pval > alpha, 'chisquare - test for %s' + ' at arg = %s with pval = %s' % (msg,str(arg),str(pval))) + + +def check_scale_docstring(distfn): + if distfn.__doc__ is not None: + # Docstrings can be stripped if interpreter is run with -OO + npt.assert_('scale' not in distfn.__doc__) + + +if __name__ == "__main__": + npt.run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_distributions.py b/pywafo/src/wafo/stats/tests/test_distributions.py index 8178d32..8016e4f 100644 --- a/pywafo/src/wafo/stats/tests/test_distributions.py +++ b/pywafo/src/wafo/stats/tests/test_distributions.py @@ -1,36 +1,36 @@ """ Test functions for stats module """ +from __future__ import division, print_function, absolute_import -from numpy.testing import TestCase, run_module_suite, assert_equal, \ - assert_array_equal, assert_almost_equal, assert_array_almost_equal, \ - assert_allclose, assert_, rand, dec +import warnings +import re +import sys +from numpy.testing import (TestCase, run_module_suite, assert_equal, + assert_array_equal, assert_almost_equal, assert_array_almost_equal, + assert_allclose, assert_, assert_raises, rand, dec) +from nose import SkipTest import numpy import numpy as np from numpy import typecodes, array -import wafo.stats as stats -from wafo.stats.distributions import argsreduce - -def kolmogorov_check(diststr, args=(), N=20, significance=0.01): - qtest = stats.ksone.isf(significance, N) - cdf = eval('stats.'+diststr+'.cdf') - dist = eval('stats.'+diststr) - # Get random numbers - kwds = {'size':N} - vals = numpy.sort(dist.rvs(*args, **kwds)) - cdfvals = cdf(vals, *args) - q = max(abs(cdfvals - np.arange(1.0, N+1)/N)) - assert_(q < qtest, msg="Failed q=%f, bound=%f, alpha=%f" % (q, qtest, significance)) - return 
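The check_discrete_chisquare helper added above drives the goodness-of-fit part of test_discrete_basic: it grows left-half-open intervals from the CDF until each holds at least 1/nsupp of the probability mass, histograms the sample into those intervals, and compares observed with expected counts via stats.chisquare. A minimal standalone sketch of the same idea, reusing the poisson(0.6) case, the sample size 2000, nsupp=20 and the seed 9765456 from the tests above; the half-integer bin edges are a simplification of the 1e-8 edge shift used in the helper:

    import numpy as np
    from scipy import stats

    np.random.seed(9765456)
    mu, n, nsupp = 0.6, 2000, 20
    rvs = stats.poisson.rvs(mu, size=n)
    wsupp = 1.0 / nsupp

    # grow bins until each carries at least wsupp of probability mass
    edges, mass, last = [-0.5], [], 0.0
    for k in range(1000):
        cur = stats.poisson.cdf(k, mu)
        if cur - last >= wsupp - 1e-14:
            edges.append(k + 0.5)
            mass.append(cur - last)
            last = cur
        if cur > 1 - wsupp:
            break
    edges.append(1000.5)        # catch-all bin for the right tail
    mass.append(1.0 - last)

    # observed vs. expected counts in the constructed bins
    freq, _ = np.histogram(rvs, edges)
    chi2, pval = stats.chisquare(freq, n * np.array(mass))
    print("chi-square p-value: %.3f" % pval)  # should exceed alpha = 0.01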
+from scipy.lib._version import NumpyVersion +from scipy import special +import scipy.stats as stats +from scipy.stats._distn_infrastructure import argsreduce +from scipy.special import xlogy + + +# python -OO strips docstrings +DOCSTRINGS_STRIPPED = sys.flags.optimize > 1 # generate test cases to test cdf and distribution consistency dists = ['uniform','norm','lognorm','expon','beta', 'powerlaw','bradford','burr','fisk','cauchy','halfcauchy', 'foldcauchy','gamma','gengamma','loggamma', - 'alpha','anglit','arcsine','betaprime','erlang', + 'alpha','anglit','arcsine','betaprime', 'dgamma','exponweib','exponpow','frechet_l','frechet_r', 'gilbrat','f','ncf','chi2','chi','nakagami','genpareto', 'genextreme','genhalflogistic','pareto','lomax','halfnorm', @@ -38,11 +38,23 @@ dists = ['uniform','norm','lognorm','expon','beta', 'weibull_min','weibull_max','dweibull','maxwell','rayleigh', 'genlogistic', 'logistic','gumbel_l','gumbel_r','gompertz', 'hypsecant', 'laplace', 'reciprocal','triang','tukeylambda', - 'vonmises'] + 'vonmises', 'vonmises_line', 'pearson3'] + +# check function for test generator +def check_distribution(dist, args, alpha): + D,pval = stats.kstest(dist,'', args=args, N=1000) + if (pval < alpha): + D,pval = stats.kstest(dist,'',args=args, N=1000) + # if (pval < alpha): + # D,pval = stats.kstest(dist,'',args=args, N=1000) + assert_(pval > alpha, msg="D = " + str(D) + "; pval = " + str(pval) + + "; alpha = " + str(alpha) + "\nargs = " + str(args)) # nose test generator + + def test_all_distributions(): for dist in dists: distfunc = getattr(stats, dist) @@ -50,9 +62,8 @@ def test_all_distributions(): alpha = 0.01 if dist == 'fatiguelife': alpha = 0.001 - if dist == 'erlang': - args = (4,)+tuple(rand(2)) - elif dist == 'frechet': + + if dist == 'frechet': args = tuple(2*rand(1))+(0,)+tuple(2*rand(2)) elif dist == 'triang': args = tuple(rand(nargs)) @@ -66,131 +77,19 @@ def test_all_distributions(): args = tuple(1.0+rand(nargs)) else: args = tuple(1.0+rand(nargs)) - yield check_distribution, dist, args, alpha -def test_ppf_and_isf_all_distributions(): - for dist in dists: - distfunc = getattr(stats, dist) - nargs = distfunc.numargs - for check_fun in [check_distribution_ppf, check_distribution_isf]: - if dist == 'erlang': - args = (4,)+tuple(rand(2)) - elif dist == 'frechet': - args = tuple(2*rand(1))+(0,)+tuple(2*rand(2)) - elif dist == 'triang': - args = tuple(rand(nargs)) - elif dist == 'reciprocal': - vals = rand(nargs) - vals[1] = vals[0] + 1.0 - args = tuple(vals) - elif dist == 'vonmises': - yield check_fun, dist, (10,) - yield check_fun, dist, (101,) - args = tuple(1.0+rand(nargs)) - else: - args = tuple(1.0+rand(nargs)) - yield check_fun, dist, args - -def check_distribution_ppf(diststr, args): - dist = getattr(stats, diststr) - n = dist.numargs - loc, scale = 0, 1 - rv = dist(*args) - loc_scale = rv.par[n:] - if len(loc_scale)>0: - loc = loc_scale[0] - if len(loc_scale)>1: - scale = loc_scale[1] - - limits = rv.ppf([0, 1]) #[1-1e-15, 1e-15]) - true_limits = np.array([rv.dist.a*scale + loc, rv.dist.b*scale + loc]) - - assert_allclose(limits, true_limits, atol=1e-7, err_msg='Expected support for distribution') - -def check_distribution_isf(diststr, args): - dist = getattr(stats, diststr) - n = dist.numargs - loc, scale = 0, 1 - rv = dist(*args) - loc_scale = rv.par[n:] - if len(loc_scale)>0: - loc = loc_scale[0] - if len(loc_scale)>1: - scale = loc_scale[1] - - limits = rv.isf([1, 0]) #[1-1e-15, 1e-15]) - true_limits = np.array([rv.dist.a*scale + loc, rv.dist.b*scale + loc]) - 
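check_distribution above relies on the convention that stats.kstest accepts a distribution name for rvs and an empty string for cdf, in which case it draws N variates from that distribution and tests them against its own CDF; the helper retries once before asserting on the p-value. A small illustrative call for a single entry of dists, assuming the shape value 2.0 as an example argument for gamma:

    import numpy as np
    from scipy import stats

    np.random.seed(1234)
    D, pval = stats.kstest('gamma', '', args=(2.0,), N=1000)
    print("KS statistic D=%.4f, p-value=%.3f" % (D, pval))
    # for a correctly implemented distribution the p-value should
    # normally stay above the alpha = 0.01 used by check_distribution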
- assert_allclose(limits, true_limits, atol=1e-7, err_msg='Expected support for distribution') - -class TestFitMethod(TestCase): - skip = ['ncf'] - - @dec.slow - def test_fit(self): - for func, dist, args, alpha in test_all_distributions(): - if dist in self.skip: - continue - distfunc = getattr(stats, dist) - res = distfunc.rvs(*args, **{'size':200}) - vals = distfunc.fit(res) - vals2 = distfunc.fit(res, optimizer='powell') - # Only check the length of the return - # FIXME: should check the actual results to see if we are 'close' - # to what was created --- but what is 'close' enough - if dist in ['erlang', 'frechet']: - assert_(len(vals)==len(args)) - assert_(len(vals2)==len(args)) - else: - assert_(len(vals) == 2+len(args)) - assert_(len(vals2)==2+len(args)) - - @dec.slow - def test_fix_fit(self): - for func, dist, args, alpha in test_all_distributions(): - # Not sure why 'ncf', and 'beta' are failing - # erlang and frechet have different len(args) than distfunc.numargs - if dist in self.skip + ['erlang', 'frechet', 'beta']: - continue - distfunc = getattr(stats, dist) - res = distfunc.rvs(*args, **{'size':200}) - #print(distfunc.name) - vals = distfunc.fit(res,floc=0) - vals2 = distfunc.fit(res,fscale=1) - assert_(len(vals) == 2+len(args)) - assert_(vals[-2] == 0) - assert_(vals2[-1] == 1) - assert_(len(vals2) == 2+len(args)) - if len(args) > 0: - vals3 = distfunc.fit(res, f0=args[0]) - assert_(len(vals3) == 2+len(args)) - assert_(vals3[0] == args[0]) - if len(args) > 1: - vals4 = distfunc.fit(res, f1=args[1]) - assert_(len(vals4) == 2+len(args)) - assert_(vals4[1] == args[1]) - if len(args) > 2: - vals5 = distfunc.fit(res, f2=args[2]) - assert_(len(vals5) == 2+len(args)) - assert_(vals5[2] == args[2]) - -# check function for test generator -def check_distribution(dist, args, alpha): - D,pval = stats.kstest(dist,'', args=args, N=1000) - if (pval < alpha): - D,pval = stats.kstest(dist,'',args=args, N=1000) - #if (pval < alpha): - # D,pval = stats.kstest(dist,'',args=args, N=1000) - assert_(pval > alpha, msg="D = " + str(D) + "; pval = " + str(pval) + \ - "; alpha = " + str(alpha) + "\nargs = " + str(args)) + yield check_distribution, dist, args, alpha def check_vonmises_pdf_periodic(k,l,s,x): vm = stats.vonmises(k,loc=l,scale=s) - assert_almost_equal(vm.pdf(x),vm.pdf(x%(2*numpy.pi*s))) + assert_almost_equal(vm.pdf(x),vm.pdf(x % (2*numpy.pi*s))) + + def check_vonmises_cdf_periodic(k,l,s,x): vm = stats.vonmises(k,loc=l,scale=s) - assert_almost_equal(vm.cdf(x)%1,vm.cdf(x%(2*numpy.pi*s))%1) + assert_almost_equal(vm.cdf(x) % 1,vm.cdf(x % (2*numpy.pi*s)) % 1) + def test_vonmises_pdf_periodic(): for k in [0.1, 1, 101]: @@ -203,6 +102,12 @@ def test_vonmises_pdf_periodic(): yield check_vonmises_cdf_periodic, k, 1, 1, x yield check_vonmises_cdf_periodic, k, 0, 10, x + +def test_vonmises_line_support(): + assert_equal(stats.vonmises_line.a, -np.pi) + assert_equal(stats.vonmises_line.b, np.pi) + + class TestRandInt(TestCase): def test_rvs(self): vals = stats.randint.rvs(5,30,size=100) @@ -213,7 +118,7 @@ class TestRandInt(TestCase): assert_(vals.dtype.char in typecodes['AllInteger']) val = stats.randint.rvs(15,46) assert_((val >= 15) & (val < 46)) - assert_(isinstance(val, numpy.ScalarType), msg=`type(val)`) + assert_(isinstance(val, numpy.ScalarType), msg=repr(type(val))) val = stats.randint(15,46).rvs(3) assert_(val.dtype.char in typecodes['AllInteger']) @@ -226,10 +131,11 @@ class TestRandInt(TestCase): def test_cdf(self): x = numpy.r_[0:36:100j] k = numpy.floor(x) - out = 
numpy.select([k>=30,k>=5],[1.0,(k-5.0+1)/(30-5.0)],0) + out = numpy.select([k >= 30,k >= 5],[1.0,(k-5.0+1)/(30-5.0)],0) vals = stats.randint.cdf(x,5,30) assert_array_almost_equal(vals, out, decimal=12) + class TestBinom(TestCase): def test_rvs(self): vals = stats.binom.rvs(10, 0.75, size=(2, 50)) @@ -242,6 +148,29 @@ class TestBinom(TestCase): assert_(isinstance(val, numpy.ndarray)) assert_(val.dtype.char in typecodes['AllInteger']) + def test_pmf(self): + # regression test for Ticket #1842 + vals1 = stats.binom.pmf(100, 100,1) + vals2 = stats.binom.pmf(0, 100,0) + assert_allclose(vals1, 1.0, rtol=1e-15, atol=0) + assert_allclose(vals2, 1.0, rtol=1e-15, atol=0) + + def test_entropy(self): + # Basic entropy tests. + b = stats.binom(2, 0.5) + expected_p = np.array([0.25, 0.5, 0.25]) + expected_h = -sum(xlogy(expected_p, expected_p)) + h = b.entropy() + assert_allclose(h, expected_h) + + b = stats.binom(2, 0.0) + h = b.entropy() + assert_equal(h, 0.0) + + b = stats.binom(2, 1.0) + h = b.entropy() + assert_equal(h, 0.0) + class TestBernoulli(TestCase): def test_rvs(self): @@ -255,6 +184,22 @@ class TestBernoulli(TestCase): assert_(isinstance(val, numpy.ndarray)) assert_(val.dtype.char in typecodes['AllInteger']) + def test_entropy(self): + # Simple tests of entropy. + b = stats.bernoulli(0.25) + expected_h = -0.25*np.log(0.25) - 0.75*np.log(0.75) + h = b.entropy() + assert_allclose(h, expected_h) + + b = stats.bernoulli(0.0) + h = b.entropy() + assert_equal(h, 0.0) + + b = stats.bernoulli(1.0) + h = b.entropy() + assert_equal(h, 0.0) + + class TestNBinom(TestCase): def test_rvs(self): vals = stats.nbinom.rvs(10, 0.75, size=(2, 50)) @@ -267,6 +212,12 @@ class TestNBinom(TestCase): assert_(isinstance(val, numpy.ndarray)) assert_(val.dtype.char in typecodes['AllInteger']) + def test_pmf(self): + # regression test for ticket 1779 + assert_allclose(np.exp(stats.nbinom.logpmf(700, 721, 0.52)), + stats.nbinom.pmf(700, 721, 0.52)) + + class TestGeom(TestCase): def test_rvs(self): vals = stats.geom.rvs(0.75, size=(2, 50)) @@ -283,30 +234,69 @@ class TestGeom(TestCase): vals = stats.geom.pmf([1,2,3],0.5) assert_array_almost_equal(vals,[0.5,0.25,0.125]) + def test_logpmf(self): + # regression test for ticket 1793 + vals1 = np.log(stats.geom.pmf([1,2,3], 0.5)) + vals2 = stats.geom.logpmf([1,2,3], 0.5) + assert_allclose(vals1, vals2, rtol=1e-15, atol=0) + def test_cdf_sf(self): - vals = stats.geom.cdf([1,2,3],0.5) - vals_sf = stats.geom.sf([1,2,3],0.5) - expected = array([0.5,0.75,0.875]) - assert_array_almost_equal(vals,expected) - assert_array_almost_equal(vals_sf,1-expected) + vals = stats.geom.cdf([1, 2, 3], 0.5) + vals_sf = stats.geom.sf([1, 2, 3], 0.5) + expected = array([0.5, 0.75, 0.875]) + assert_array_almost_equal(vals, expected) + assert_array_almost_equal(vals_sf, 1-expected) + + def test_logcdf_logsf(self): + vals = stats.geom.logcdf([1, 2, 3], 0.5) + vals_sf = stats.geom.logsf([1, 2, 3], 0.5) + expected = array([0.5, 0.75, 0.875]) + assert_array_almost_equal(vals, np.log(expected)) + assert_array_almost_equal(vals_sf, np.log1p(-expected)) + + def test_ppf(self): + vals = stats.geom.ppf([0.5, 0.75, 0.875], 0.5) + expected = array([1.0, 2.0, 3.0]) + assert_array_almost_equal(vals, expected) + class TestTruncnorm(TestCase): def test_ppf_ticket1131(self): - vals = stats.truncnorm.ppf([-0.5,0,1e-4,0.5, 1-1e-4,1,2],-1., 1., - loc=[3]*7,scale=2) - NaN = np.NaN - expected = np.array([ NaN, 1. , 1.00056419, 3. - , 4.99943581, 5. 
, NaN]) + vals = stats.truncnorm.ppf([-0.5,0,1e-4,0.5, 1-1e-4,1,2], -1., 1., + loc=[3]*7, scale=2) + expected = np.array([np.nan, 1, 1.00056419, 3, 4.99943581, 5, np.nan]) assert_array_almost_equal(vals, expected) - + def test_isf_ticket1131(self): - NaN = np.NaN - vals = stats.truncnorm.isf([-0.5,0,1e-4,0.5, 1-1e-4,1,2],-1., 1., - loc=[3]*7,scale=2) - expected = np.array([ NaN, 5. , 4.99943581, 3., - 1.00056419, 1. , NaN]) + vals = stats.truncnorm.isf([-0.5,0,1e-4,0.5, 1-1e-4,1,2], -1., 1., + loc=[3]*7, scale=2) + expected = np.array([np.nan, 5, 4.99943581, 3, 1.00056419, 1, np.nan]) assert_array_almost_equal(vals, expected) + def test_gh_2477_small_values(self): + # Check a case that worked in the original issue. + low, high = -11, -10 + x = stats.truncnorm.rvs(low, high, 0, 1, size=10) + assert_(low < x.min() < x.max() < high) + # Check a case that failed in the original issue. + low, high = 10, 11 + x = stats.truncnorm.rvs(low, high, 0, 1, size=10) + assert_(low < x.min() < x.max() < high) + + def test_gh_2477_large_values(self): + # Check a case that fails because of extreme tail behavior. + raise SkipTest('truncnorm rvs is known to fail at extreme tails') + low, high = 100, 101 + x = stats.truncnorm.rvs(low, high, 0, 1, size=10) + assert_(low < x.min() < x.max() < high) + + def test_gh_1489_trac_962_rvs(self): + # Check the original example. + low, high = 10, 15 + x = stats.truncnorm.rvs(low, high, 0, 1, size=10) + assert_(low < x.min() < x.max() < high) + + class TestHypergeom(TestCase): def test_rvs(self): vals = stats.hypergeom.rvs(20, 10, 3, size=(2, 50)) assert_(numpy.shape(vals) == (2, 50)) assert_(vals.dtype.char in typecodes['AllInteger']) val = stats.hypergeom.rvs(20, 3, 10) assert_(isinstance(val, numpy.ndarray)) assert_(val.dtype.char in typecodes['AllInteger']) @@ -331,7 +321,7 @@ class TestHypergeom(TestCase): assert_almost_equal(hgpmf, 0.0010114963068932233, 11) def test_precision2(self): - """Test hypergeom precision for large numbers. See #1218.""" + # Test hypergeom precision for large numbers. See #1218. # Results compared with those from R. oranges = 9.9e4 pears = 1.1e5 @@ -350,6 +340,36 @@ class TestHypergeom(TestCase): expected2 = [1, 0.1237904, 6.511452e-34, 3.277667e-69] assert_allclose(res2, expected2, atol=0, rtol=5e-7) + def test_entropy(self): + # Simple tests of entropy. + hg = stats.hypergeom(4, 1, 1) + h = hg.entropy() + expected_p = np.array([0.75, 0.25]) + expected_h = -np.sum(xlogy(expected_p, expected_p)) + assert_allclose(h, expected_h) + + hg = stats.hypergeom(1, 1, 1) + h = hg.entropy() + assert_equal(h, 0.0) + + +class TestLoggamma(TestCase): + + def test_stats(self): + # The following precomputed values are from the table in section 2.2 + # of "A Statistical Study of Log-Gamma Distribution", by Ping Shing + # Chan (thesis, McMaster University, 1993). + table = np.array([ + # c, mean, var, skew, exc. kurt. + 0.5, -1.9635, 4.9348, -1.5351, 4.0000, + 1.0, -0.5772, 1.6449, -1.1395, 2.4000, + 12.0, 2.4427, 0.0869, -0.2946, 0.1735, + ]).reshape(-1, 5) + for c, mean, var, skew, kurt in table: + computed = stats.loggamma.stats(c, moments='mvsk') + assert_array_almost_equal(computed, [mean, var, skew, kurt], + decimal=4) + class TestLogser(TestCase): def test_rvs(self): @@ -363,6 +383,102 @@ class TestLogser(TestCase): assert_(isinstance(val, numpy.ndarray)) assert_(val.dtype.char in typecodes['AllInteger']) + +class TestPareto(TestCase): + def test_stats(self): + # Check the stats() method with some simple values. Also check + # that the calculations do not trigger RuntimeWarnings. 
+ with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + + m, v, s, k = stats.pareto.stats(0.5, moments='mvsk') + assert_equal(m, np.inf) + assert_equal(v, np.inf) + assert_equal(s, np.nan) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(1.0, moments='mvsk') + assert_equal(m, np.inf) + assert_equal(v, np.inf) + assert_equal(s, np.nan) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(1.5, moments='mvsk') + assert_equal(m, 3.0) + assert_equal(v, np.inf) + assert_equal(s, np.nan) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(2.0, moments='mvsk') + assert_equal(m, 2.0) + assert_equal(v, np.inf) + assert_equal(s, np.nan) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(2.5, moments='mvsk') + assert_allclose(m, 2.5 / 1.5) + assert_allclose(v, 2.5 / (1.5*1.5*0.5)) + assert_equal(s, np.nan) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(3.0, moments='mvsk') + assert_allclose(m, 1.5) + assert_allclose(v, 0.75) + assert_equal(s, np.nan) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(3.5, moments='mvsk') + assert_allclose(m, 3.5 / 2.5) + assert_allclose(v, 3.5 / (2.5*2.5*1.5)) + assert_allclose(s, (2*4.5/0.5)*np.sqrt(1.5/3.5)) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(4.0, moments='mvsk') + assert_allclose(m, 4.0 / 3.0) + assert_allclose(v, 4.0 / 18.0) + assert_allclose(s, 2*(1+4.0)/(4.0-3) * np.sqrt((4.0-2)/4.0)) + assert_equal(k, np.nan) + + m, v, s, k = stats.pareto.stats(4.5, moments='mvsk') + assert_allclose(m, 4.5 / 3.5) + assert_allclose(v, 4.5 / (3.5*3.5*2.5)) + assert_allclose(s, (2*5.5/1.5) * np.sqrt(2.5/4.5)) + assert_allclose(k, 6*(4.5**3 + 4.5**2 - 6*4.5 - 2)/(4.5*1.5*0.5)) + + +class TestPearson3(TestCase): + def test_rvs(self): + vals = stats.pearson3.rvs(0.1, size=(2, 50)) + assert_(numpy.shape(vals) == (2, 50)) + assert_(vals.dtype.char in typecodes['AllFloat']) + val = stats.pearson3.rvs(0.5) + assert_(isinstance(val, float)) + val = stats.pearson3(0.5).rvs(3) + assert_(isinstance(val, numpy.ndarray)) + assert_(val.dtype.char in typecodes['AllFloat']) + assert_(len(val) == 3) + + def test_pdf(self): + vals = stats.pearson3.pdf(2, [0.0, 0.1, 0.2]) + assert_allclose(vals, np.array([0.05399097, 0.05555481, 0.05670246]), + atol=1e-6) + vals = stats.pearson3.pdf(-3, 0.1) + assert_allclose(vals, np.array([0.00313791]), atol=1e-6) + vals = stats.pearson3.pdf([-3,-2,-1,0,1], 0.1) + assert_allclose(vals, np.array([0.00313791, 0.05192304, 0.25028092, + 0.39885918, 0.23413173]), atol=1e-6) + + def test_cdf(self): + vals = stats.pearson3.cdf(2, [0.0, 0.1, 0.2]) + assert_allclose(vals, np.array([0.97724987, 0.97462004, 0.97213626]), + atol=1e-6) + vals = stats.pearson3.cdf(-3, 0.1) + assert_allclose(vals, [0.00082256], atol=1e-6) + vals = stats.pearson3.cdf([-3,-2,-1,0,1], 0.1) + assert_allclose(vals, [8.22563821e-04, 1.99860448e-02, 1.58550710e-01, + 5.06649130e-01, 8.41442111e-01], atol=1e-6) + + class TestPoisson(TestCase): def test_rvs(self): vals = stats.poisson.rvs(0.5, size=(2, 50)) @@ -375,6 +491,12 @@ class TestPoisson(TestCase): assert_(isinstance(val, numpy.ndarray)) assert_(val.dtype.char in typecodes['AllInteger']) + def test_stats(self): + mu = 16.0 + result = stats.poisson.stats(mu, moments='mvsk') + assert_allclose(result, [mu, mu, np.sqrt(1.0/mu), 1.0/mu]) + + class TestZipf(TestCase): def test_rvs(self): vals = stats.zipf.rvs(1.5, size=(2, 50)) @@ -387,9 +509,19 @@ class TestZipf(TestCase): assert_(isinstance(val, numpy.ndarray)) 
assert_(val.dtype.char in typecodes['AllInteger']) + def test_moments(self): + # n-th moment is finite iff a > n + 1 + m, v = stats.zipf.stats(a=2.8) + assert_(np.isfinite(m)) + assert_equal(v, np.inf) + + s, k = stats.zipf.stats(a=4.8, moments='sk') + assert_(not np.isfinite([s, k]).all()) + + class TestDLaplace(TestCase): def test_rvs(self): - vals = stats.dlaplace.rvs(1.5 , size=(2, 50)) + vals = stats.dlaplace.rvs(1.5, size=(2, 50)) assert_(numpy.shape(vals) == (2, 50)) assert_(vals.dtype.char in typecodes['AllInteger']) val = stats.dlaplace.rvs(1.5) @@ -397,32 +529,78 @@ class TestDLaplace(TestCase): val = stats.dlaplace(1.5).rvs(3) assert_(isinstance(val, numpy.ndarray)) assert_(val.dtype.char in typecodes['AllInteger']) + assert_(stats.dlaplace.rvs(0.8) is not None) + + + def test_stats(self): + # compare the explicit formulas w/ direct summation using pmf + a = 1. + dl = stats.dlaplace(a) + m, v, s, k = dl.stats('mvsk') + + N = 37 + xx = np.arange(-N, N+1) + pp = dl.pmf(xx) + m2, m4 = np.sum(pp*xx**2), np.sum(pp*xx**4) + assert_equal((m, s), (0,0)) + assert_allclose((v, k), (m2, m4/m2**2 - 3.), atol=1e-14, rtol=1e-8) + + def test_stats2(self): + a = np.log(2.) + dl = stats.dlaplace(a) + m, v, s, k = dl.stats('mvsk') + assert_equal((m, s), (0.,0.)) + assert_allclose((v, k), (4., 3.25)) + + +class TestInvGamma(TestCase): + @dec.skipif(NumpyVersion(np.__version__) < '1.7.0', + "assert_* funcs broken with inf/nan") + def test_invgamma_inf_gh_1866(self): + # invgamma's moments are only finite for a>n + # specific numbers checked w/ boost 1.54 + with warnings.catch_warnings(): + warnings.simplefilter('error', RuntimeWarning) + mvsk = stats.invgamma.stats(a=19.31, moments='mvsk') + assert_allclose(mvsk, + [0.05461496450, 0.0001723162534, 1.020362676, 2.055616582]) + + a = [1.1, 3.1, 5.6] + mvsk = stats.invgamma.stats(a=a, moments='mvsk') + expected = ([10., 0.476190476, 0.2173913043], # mmm + [np.inf, 0.2061430632, 0.01312749422], # vvv + [np.nan, 41.95235392, 2.919025532], # sss + [np.nan, np.nan, 24.51923076]) # kkk + for x, y in zip(mvsk, expected): + assert_almost_equal(x, y) + + +class TestF(TestCase): + def test_f_moments(self): + # n-th moment of F distributions is only finite for n < dfd / 2 + m, v, s, k = stats.f.stats(11, 6.5, moments='mvsk') + assert_(np.isfinite(m)) + assert_(np.isfinite(v)) + assert_(np.isfinite(s)) + assert_(not np.isfinite(k)) + + def test_moments_warnings(self): + # no warnings should be generated for dfd = 2, 4, 6, 8 (div by zero) + with warnings.catch_warnings(): + warnings.simplefilter('error', RuntimeWarning) + stats.f.stats(dfn=[11]*4, dfd=[2, 4, 6, 8], moments='mvsk') + + @dec.knownfailureif(True, 'f stats does not properly broadcast') + def test_stats_broadcast(self): + # stats do not fully broadcast just yet + mv = stats.f.stats(dfn=11, dfd=[11, 12]) + def test_rvgeneric_std(): - """Regression test for #1191""" + # Regression test for #1191 assert_array_almost_equal(stats.t.std([5, 6]), [1.29099445, 1.22474487]) -def test_nan_arguments_ticket835(): - assert_(np.isnan(stats.t.logcdf(np.nan))) - assert_(np.isnan(stats.t.cdf(np.nan))) - assert_(np.isnan(stats.t.logsf(np.nan))) - assert_(np.isnan(stats.t.sf(np.nan))) - assert_(np.isnan(stats.t.pdf(np.nan))) - assert_(np.isnan(stats.t.logpdf(np.nan))) - assert_(np.isnan(stats.t.ppf(np.nan))) - assert_(np.isnan(stats.t.isf(np.nan))) - - pr = 0.5 - assert_(np.isnan(stats.bernoulli.logcdf(np.nan, pr))) - assert_(np.isnan(stats.bernoulli.cdf(np.nan, pr))) - assert_(np.isnan(stats.bernoulli.logsf(np.nan, 
pr))) - assert_(np.isnan(stats.bernoulli.sf(np.nan, pr))) - assert_(np.isnan(stats.bernoulli.pmf(np.nan, pr))) - assert_(np.isnan(stats.bernoulli.logpmf(np.nan, pr))) - assert_(np.isnan(stats.bernoulli.ppf(np.nan, pr))) - assert_(np.isnan(stats.bernoulli.isf(np.nan, pr))) - - + class TestRvDiscrete(TestCase): def test_rvs(self): states = [-1,0,1,2,3,4] @@ -438,13 +616,27 @@ class TestRvDiscrete(TestCase): x = r.rvs() assert_(isinstance(x, int)) + def test_entropy(self): + # Basic tests of entropy. + pvals = np.array([0.25, 0.45, 0.3]) + p = stats.rv_discrete(values=([0, 1, 2], pvals)) + expected_h = -sum(xlogy(pvals, pvals)) + h = p.entropy() + assert_allclose(h, expected_h) + + p = stats.rv_discrete(values=([0, 1, 2], [1.0, 0, 0])) + h = p.entropy() + assert_equal(h, 0.0) + + class TestExpon(TestCase): def test_zero(self): assert_equal(stats.expon.pdf(0),1) def test_tail(self): # Regression test for ticket 807 - assert_equal(stats.expon.cdf(1e-18), 1e-18) - assert_equal(stats.expon.isf(stats.expon.sf(40)), 40) + assert_equal(stats.expon.cdf(1e-18), 1e-18) + assert_equal(stats.expon.isf(stats.expon.sf(40)), 40) + class TestGenExpon(TestCase): def test_pdf_unity_area(self): @@ -459,15 +651,16 @@ class TestGenExpon(TestCase): cdf = stats.genexpon.cdf(numpy.arange(0, 10, 0.01), 0.5, 0.5, 2.0) assert_(numpy.all((0 <= cdf) & (cdf <= 1))) + class TestExponpow(TestCase): def test_tail(self): - assert_almost_equal(stats.exponpow.cdf(1e-10, 2.), 1e-20) - assert_almost_equal(stats.exponpow.isf(stats.exponpow.sf(5, .8), .8), 5) + assert_almost_equal(stats.exponpow.cdf(1e-10, 2.), 1e-20) + assert_almost_equal(stats.exponpow.isf(stats.exponpow.sf(5, .8), .8), 5) class TestSkellam(TestCase): def test_pmf(self): - #comparison to R + # comparison to R k = numpy.arange(-10, 15) mu1, mu2 = 10, 5 skpmfR = numpy.array( @@ -488,7 +681,7 @@ class TestSkellam(TestCase): assert_almost_equal(stats.skellam.pmf(k, mu1, mu2), skpmfR, decimal=15) def test_cdf(self): - #comparison to R, only 5 decimals + # comparison to R, only 5 decimals k = numpy.arange(-10, 15) mu1, mu2 = 10, 5 skcdfR = numpy.array( @@ -509,8 +702,40 @@ class TestSkellam(TestCase): assert_almost_equal(stats.skellam.cdf(k, mu1, mu2), skcdfR, decimal=5) -class TestGamma(TestCase): +class TestLognorm(TestCase): + def test_pdf(self): + # Regression test for Ticket #1471: avoid nan with 0/0 situation + with np.errstate(divide='ignore'): + pdf = stats.lognorm.pdf([0, 0.5, 1], 1) + assert_array_almost_equal(pdf, [0.0, 0.62749608, 0.39894228]) + + +class TestBeta(TestCase): + def test_logpdf(self): + # Regression test for Ticket #1326: avoid nan with 0*log(0) situation + logpdf = stats.beta.logpdf(0,1,0.5) + assert_almost_equal(logpdf, -0.69314718056) + logpdf = stats.beta.logpdf(0,0.5,1) + assert_almost_equal(logpdf, np.inf) + + def test_logpdf_ticket_1866(self): + alpha, beta = 267, 1472 + x = np.array([0.2, 0.5, 0.6]) + b = stats.beta(alpha, beta) + assert_allclose(b.logpdf(x).sum(), -1201.699061824062) + assert_allclose(b.pdf(x), np.exp(b.logpdf(x))) + +class TestBetaPrime(TestCase): + def test_logpdf(self): + alpha, beta = 267, 1472 + x = np.array([0.2, 0.5, 0.6]) + b = stats.betaprime(alpha, beta) + assert_(np.isfinite(b.logpdf(x)).all()) + assert_allclose(b.pdf(x), np.exp(b.logpdf(x))) + + +class TestGamma(TestCase): def test_pdf(self): # a few test cases to compare with R pdf = stats.gamma.pdf(90, 394, scale=1./5) @@ -519,17 +744,11 @@ class TestGamma(TestCase): pdf = stats.gamma.pdf(3, 10, scale=1./5) assert_almost_equal(pdf, 0.1620358) -class 
TestHypergeom2(TestCase): - def test_precision(self): - # comparison number from mpmath - M = 2500 - n = 50 - N = 500 - tot = M - good = n - hgpmf = stats.hypergeom.pmf(2, tot, good, N) - - assert_almost_equal(hgpmf, 0.0010114963068932233, 11) + def test_logpdf(self): + # Regression test for Ticket #1326: cornercase avoid nan with 0*log(0) + # situation + logpdf = stats.gamma.logpdf(0,1) + assert_almost_equal(logpdf, 0) class TestChi2(TestCase): @@ -538,28 +757,30 @@ class TestChi2(TestCase): assert_almost_equal(stats.chi2.pdf(1000, 1000), 8.919133934753128e-003, 14) assert_almost_equal(stats.chi2.pdf(100, 100), 0.028162503162596778, 14) -class TestArrayArgument(TestCase): #test for ticket:992 + +class TestArrayArgument(TestCase): # test for ticket:992 def test_noexception(self): rvs = stats.norm.rvs(loc=(np.arange(5)), scale=np.ones(5), size=(10,5)) assert_equal(rvs.shape, (10,5)) + class TestDocstring(TestCase): def test_docstrings(self): - """See ticket #761""" + # See ticket #761 if stats.rayleigh.__doc__ is not None: self.assertTrue("rayleigh" in stats.rayleigh.__doc__.lower()) if stats.bernoulli.__doc__ is not None: self.assertTrue("bernoulli" in stats.bernoulli.__doc__.lower()) def test_no_name_arg(self): - """If name is not given, construction shouldn't fail. See #1508.""" + # If name is not given, construction shouldn't fail. See #1508. stats.rv_continuous() stats.rv_discrete() class TestEntropy(TestCase): def test_entropy_positive(self): - """See ticket #497""" + # See ticket #497 pk = [0.5,0.2,0.3] qk = [0.1,0.25,0.65] eself = stats.entropy(pk,pk) @@ -567,6 +788,41 @@ class TestEntropy(TestCase): assert_(0.0 == eself) assert_(edouble >= 0.0) + def test_entropy_base(self): + pk = np.ones(16, float) + S = stats.entropy(pk, base=2.) + assert_(abs(S - 4.) < 1.e-5) + + qk = np.ones(16, float) + qk[:8] = 2. + S = stats.entropy(pk, qk) + S2 = stats.entropy(pk, qk, base=2.) + assert_(abs(S/S2 - np.log(2.)) < 1.e-5) + + def test_entropy_zero(self): + # Test for PR-479 + assert_almost_equal(stats.entropy([0, 1, 2]), 0.63651416829481278, + decimal=12) + + def test_entropy_2d(self): + pk = [[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]] + qk = [[0.2, 0.1], [0.3, 0.6], [0.5, 0.3]] + assert_array_almost_equal(stats.entropy(pk, qk), + [0.1933259, 0.18609809]) + + @dec.skipif(NumpyVersion(np.__version__) < '1.7.0', + "assert_* funcs broken with inf/nan") + def test_entropy_2d_zero(self): + pk = [[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]] + qk = [[0.0, 0.1], [0.3, 0.6], [0.5, 0.3]] + assert_array_almost_equal(stats.entropy(pk, qk), + [np.inf, 0.18609809]) + + pk[0][0] = 0.0 + assert_array_almost_equal(stats.entropy(pk, qk), + [0.17403988, 0.18609809]) + + def TestArgsreduce(): a = array([1,3,2,1,2,3,3]) b,c = argsreduce(a > 1, a, 2) @@ -583,13 +839,186 @@ def TestArgsreduce(): assert_array_equal(c, [2] * numpy.size(a)) +class TestFitMethod(object): + skip = ['ncf'] -class TestFrozen(TestCase): - """Test that a frozen distribution gives the same results as the original object. 
+ @dec.slow + def test_fit(self): + def check(func, dist, args, alpha): + if dist in self.skip: + raise SkipTest("%s fit known to fail" % dist) + distfunc = getattr(stats, dist) + with np.errstate(all='ignore'): + res = distfunc.rvs(*args, **{'size':200}) + vals = distfunc.fit(res) + vals2 = distfunc.fit(res, optimizer='powell') + # Only check the length of the return + # FIXME: should check the actual results to see if we are 'close' + # to what was created --- but what is 'close' enough + if dist == 'frechet': + assert_(len(vals) == len(args)) + assert_(len(vals2) == len(args)) + else: + assert_(len(vals) == 2+len(args)) + assert_(len(vals2) == 2+len(args)) - Only tested for the normal distribution (with loc and scale specified) and for the - gamma distribution (with a shape parameter specified). - """ + for func, dist, args, alpha in test_all_distributions(): + yield check, func, dist, args, alpha + + @dec.slow + def test_fix_fit(self): + def check(func, dist, args, alpha): + # Not sure why 'ncf', and 'beta' are failing + # frechet has different len(args) than distfunc.numargs + if dist in self.skip + ['frechet']: + raise SkipTest("%s fit known to fail" % dist) + distfunc = getattr(stats, dist) + with np.errstate(all='ignore'): + res = distfunc.rvs(*args, **{'size':200}) + vals = distfunc.fit(res,floc=0) + vals2 = distfunc.fit(res,fscale=1) + assert_(len(vals) == 2+len(args)) + assert_(vals[-2] == 0) + assert_(vals2[-1] == 1) + assert_(len(vals2) == 2+len(args)) + if len(args) > 0: + vals3 = distfunc.fit(res, f0=args[0]) + assert_(len(vals3) == 2+len(args)) + assert_(vals3[0] == args[0]) + if len(args) > 1: + vals4 = distfunc.fit(res, f1=args[1]) + assert_(len(vals4) == 2+len(args)) + assert_(vals4[1] == args[1]) + if len(args) > 2: + vals5 = distfunc.fit(res, f2=args[2]) + assert_(len(vals5) == 2+len(args)) + assert_(vals5[2] == args[2]) + + for func, dist, args, alpha in test_all_distributions(): + yield check, func, dist, args, alpha + + def test_fix_fit_2args_lognorm(self): + # Regression test for #1551. + np.random.seed(12345) + with np.errstate(all='ignore'): + x = stats.lognorm.rvs(0.25, 0., 20.0, size=20) + assert_allclose(np.array(stats.lognorm.fit(x, floc=0, fscale=20)), + [0.25888672, 0, 20], atol=1e-5) + + def test_fix_fit_norm(self): + x = np.arange(1, 6) + + loc, scale = stats.norm.fit(x) + assert_almost_equal(loc, 3) + assert_almost_equal(scale, np.sqrt(2)) + + loc, scale = stats.norm.fit(x, floc=2) + assert_equal(loc, 2) + assert_equal(scale, np.sqrt(3)) + + loc, scale = stats.norm.fit(x, fscale=2) + assert_almost_equal(loc, 3) + assert_equal(scale, 2) + + def test_fix_fit_gamma(self): + x = np.arange(1, 6) + meanlog = np.log(x).mean() + + # A basic test of gamma.fit with floc=0. + floc = 0 + a, loc, scale = stats.gamma.fit(x, floc=floc) + s = np.log(x.mean()) - meanlog + assert_almost_equal(np.log(a) - special.digamma(a), s, decimal=5) + assert_equal(loc, floc) + assert_almost_equal(scale, x.mean()/a, decimal=8) + + # Regression tests for gh-2514. + # The problem was that if `floc=0` was given, any other fixed + # parameters were ignored. + f0 = 1 + floc = 0 + a, loc, scale = stats.gamma.fit(x, f0=f0, floc=floc) + assert_equal(a, f0) + assert_equal(loc, floc) + assert_almost_equal(scale, x.mean()/a, decimal=8) + + f0 = 2 + floc = 0 + a, loc, scale = stats.gamma.fit(x, f0=f0, floc=floc) + assert_equal(a, f0) + assert_equal(loc, floc) + assert_almost_equal(scale, x.mean()/a, decimal=8) + + # loc and scale fixed. 
+ floc = 0 + fscale = 2 + a, loc, scale = stats.gamma.fit(x, floc=floc, fscale=fscale) + assert_equal(loc, floc) + assert_equal(scale, fscale) + c = meanlog - np.log(fscale) + assert_almost_equal(special.digamma(a), c) + + def test_fix_fit_beta(self): + # Test beta.fit when both floc and fscale are given. + + def mlefunc(a, b, x): + # Zeros of this function are critical points of + # the maximum likelihood function. + n = len(x) + s1 = np.log(x).sum() + s2 = np.log(1-x).sum() + psiab = special.psi(a + b) + func = [s1 - n * (-psiab + special.psi(a)), + s2 - n * (-psiab + special.psi(b))] + return func + + # Basic test with floc and fscale given. + x = np.array([0.125, 0.25, 0.5]) + a, b, loc, scale = stats.beta.fit(x, floc=0, fscale=1) + assert_equal(loc, 0) + assert_equal(scale, 1) + assert_allclose(mlefunc(a, b, x), [0,0], atol=1e-6) + + # Basic test with f0, floc and fscale given. + # This is also a regression test for gh-2514. + x = np.array([0.125, 0.25, 0.5]) + a, b, loc, scale = stats.beta.fit(x, f0=2, floc=0, fscale=1) + assert_equal(a, 2) + assert_equal(loc, 0) + assert_equal(scale, 1) + da, db = mlefunc(a, b, x) + assert_allclose(db, 0, atol=1e-5) + + # Same floc and fscale values as above, but reverse the data + # and fix b (f1). + x2 = 1 - x + a2, b2, loc2, scale2 = stats.beta.fit(x2, f1=2, floc=0, fscale=1) + assert_equal(b2, 2) + assert_equal(loc2, 0) + assert_equal(scale2, 1) + da, db = mlefunc(a2, b2, x2) + assert_allclose(da, 0, atol=1e-5) + # a2 of this test should equal b from above. + assert_almost_equal(a2, b) + + # Check for detection of data out of bounds when floc and fscale + # are given. + assert_raises(ValueError, stats.beta.fit, x, floc=0.5, fscale=1) + y = np.array([0, .5, 1]) + assert_raises(ValueError, stats.beta.fit, y, floc=0, fscale=1) + assert_raises(ValueError, stats.beta.fit, y, floc=0, fscale=1, f0=2) + assert_raises(ValueError, stats.beta.fit, y, floc=0, fscale=1, f1=2) + + # Check that attempting to fix all the parameters raises a ValueError. + assert_raises(ValueError, stats.beta.fit, y, f0=0, f1=1, + floc=2, fscale=3) + + +class TestFrozen(TestCase): + # Test that a frozen distribution gives the same results as the original object. + # + # Only tested for the normal distribution (with loc and scale specified) + # and for the gamma distribution (with a shape parameter specified). def test_norm(self): dist = stats.norm frozen = stats.norm(loc=10.0, scale=3.0) @@ -687,14 +1116,13 @@ class TestFrozen(TestCase): result = dist.moment(2, a) assert_equal(result_f, result) - def test_regression_02(self): - """Regression test for ticket #1293.""" + def test_regression_ticket_1293(self): # Create a frozen distribution. frozen = stats.lognorm(1) # Call one of its methods that does not take any keyword arguments. m1 = frozen.moment(2) # Now call a method that takes a keyword argument. - s = frozen.stats(moments='mvsk') + frozen.stats(moments='mvsk') # Call moment(2) again. # After calling stats(), the following was raising an exception. # So this test passes if the following does not raise an exception. @@ -703,13 +1131,12 @@ class TestFrozen(TestCase): # the focus of this test. assert_equal(m1, m2) -class TestExpect(TestCase): - """Test for expect method. - Uses normal distribution and beta distribution for finite bounds, and - hypergeom for discrete distribution with finite support - - """ +class TestExpect(TestCase): + # Test for expect method. 
+ # + # Uses normal distribution and beta distribution for finite bounds, and + # hypergeom for discrete distribution with finite support def test_norm(self): v = stats.norm.expect(lambda x: (x-5)*(x-5), loc=5, scale=2) assert_almost_equal(v, 4, decimal=14) @@ -727,32 +1154,28 @@ class TestExpect(TestCase): assert_almost_equal(prob90c, 1., decimal=14) def test_beta(self): - #case with finite support interval -## >>> mtrue, vtrue = stats.beta.stats(10,5, loc=5., scale=2.) -## >>> mtrue, vtrue -## (array(6.333333333333333), array(0.055555555555555552)) + # case with finite support interval v = stats.beta.expect(lambda x: (x-19/3.)*(x-19/3.), args=(10,5), loc=5, scale=2) - assert_almost_equal(v, 1./18., decimal=14) + assert_almost_equal(v, 1./18., decimal=13) m = stats.beta.expect(lambda x: x, args=(10,5), loc=5., scale=2.) - assert_almost_equal(m, 19/3., decimal=14) + assert_almost_equal(m, 19/3., decimal=13) ub = stats.beta.ppf(0.95, 10, 10, loc=5, scale=2) lb = stats.beta.ppf(0.05, 10, 10, loc=5, scale=2) prob90 = stats.beta.expect(lambda x: 1., args=(10,10), loc=5., scale=2.,lb=lb, ub=ub, conditional=False) - assert_almost_equal(prob90, 0.9, decimal=14) + assert_almost_equal(prob90, 0.9, decimal=13) prob90c = stats.beta.expect(lambda x: 1, args=(10,10), loc=5, scale=2, lb=lb, ub=ub, conditional=True) - assert_almost_equal(prob90c, 1., decimal=14) - + assert_almost_equal(prob90c, 1., decimal=13) def test_hypergeom(self): - #test case with finite bounds + # test case with finite bounds - #without specifying bounds + # without specifying bounds m_true, v_true = stats.hypergeom.stats(20, 10, 8, loc=5.) m = stats.hypergeom.expect(lambda x: x, args=(20, 10, 8), loc=5.) assert_almost_equal(m, m_true, decimal=13) @@ -761,59 +1184,175 @@ class TestExpect(TestCase): loc=5.) 
assert_almost_equal(v, v_true, decimal=14) - #with bounds, bounds equal to shifted support + # with bounds, bounds equal to shifted support v_bounds = stats.hypergeom.expect(lambda x: (x-9.)**2, args=(20, 10, 8), loc=5., lb=5, ub=13) assert_almost_equal(v_bounds, v_true, decimal=14) - #drop boundary points + # drop boundary points prob_true = 1-stats.hypergeom.pmf([5, 13], 20, 10, 8, loc=5).sum() prob_bounds = stats.hypergeom.expect(lambda x: 1, args=(20, 10, 8), loc=5., lb=6, ub=12) assert_almost_equal(prob_bounds, prob_true, decimal=13) - #conditional + # conditional prob_bc = stats.hypergeom.expect(lambda x: 1, args=(20, 10, 8), loc=5., lb=6, ub=12, conditional=True) assert_almost_equal(prob_bc, 1, decimal=14) - #check simple integral + # check simple integral prob_b = stats.hypergeom.expect(lambda x: 1, args=(20, 10, 8), lb=0, ub=8) assert_almost_equal(prob_b, 1, decimal=13) def test_poisson(self): - #poisson, use lower bound only + # poisson, use lower bound only prob_bounds = stats.poisson.expect(lambda x: 1, args=(2,), lb=3, conditional=False) prob_b_true = 1-stats.poisson.cdf(2,2) assert_almost_equal(prob_bounds, prob_b_true, decimal=14) - prob_lb = stats.poisson.expect(lambda x: 1, args=(2,), lb=2, conditional=True) assert_almost_equal(prob_lb, 1, decimal=14) + def test_genhalflogistic(self): + # genhalflogistic, changes upper bound of support in _argcheck + # regression test for gh-2622 + halflog = stats.genhalflogistic + # check consistency when calling expect twice with the same input + res1 = halflog.expect(args=(1.5,)) + halflog.expect(args=(0.5,)) + res2 = halflog.expect(args=(1.5,)) + assert_almost_equal(res1, res2, decimal=14) + + def test_rice_overflow(self): + # rice.pdf(999, 0.74) was inf since special.i0 silently overflows + # check that using i0e fixes it + assert_(np.isfinite(stats.rice.pdf(999, 0.74))) + + assert_(np.isfinite(stats.rice.expect(lambda x: 1, args=(0.74,)))) + assert_(np.isfinite(stats.rice.expect(lambda x: 2, args=(0.74,)))) + assert_(np.isfinite(stats.rice.expect(lambda x: 3, args=(0.74,)))) + + +class TestNct(TestCase): + def test_nc_parameter(self): + # Parameter values c<=0 were not enabled (gh-2402). 
+ # For negative values c and for c=0 results of rv.cdf(0) below were nan + rv = stats.nct(5, 0) + assert_equal(rv.cdf(0), 0.5) + rv = stats.nct(5, -1) + assert_almost_equal(rv.cdf(0), 0.841344746069, decimal=10) + + def test_broadcasting(self): + res = stats.nct.pdf(5, np.arange(4,7)[:,None], np.linspace(0.1, 1, 4)) + expected = array([[0.00321886, 0.00557466, 0.00918418, 0.01442997], + [0.00217142, 0.00395366, 0.00683888, 0.01126276], + [0.00153078, 0.00291093, 0.00525206, 0.00900815]]) + assert_allclose(res, expected, rtol=1e-5) + + def text_variance_gh_issue_2401(self): + # Computation of the variance of a non-central t-distribution resulted + # in a TypeError: ufunc 'isinf' not supported for the input types, + # and the inputs could not be safely coerced to any supported types + # according to the casting rule 'safe' + rv = stats.nct(4, 0) + assert_equal(rv.var(), 2.0) + + def test_nct_inf_moments(self): + # n-th moment of nct only exists for df > n + m, v, s, k = stats.nct.stats(df=1.9, nc=0.3, moments='mvsk') + assert_(np.isfinite(m)) + assert_equal([v, s, k], [np.inf, np.nan, np.nan]) + + m, v, s, k = stats.nct.stats(df=3.1, nc=0.3, moments='mvsk') + assert_(np.isfinite([m, v, s]).all()) + assert_equal(k, np.nan) + + +class TestRice(TestCase): + def test_rice_zero_b(self): + # rice distribution should work with b=0, cf gh-2164 + x = [0.2, 1., 5.] + assert_(np.isfinite(stats.rice.pdf(x, b=0.)).all()) + assert_(np.isfinite(stats.rice.logpdf(x, b=0.)).all()) + assert_(np.isfinite(stats.rice.cdf(x, b=0.)).all()) + assert_(np.isfinite(stats.rice.logcdf(x, b=0.)).all()) + + q = [0.1, 0.1, 0.5, 0.9] + assert_(np.isfinite(stats.rice.ppf(q, b=0.)).all()) + + mvsk = stats.rice.stats(0, moments='mvsk') + assert_(np.isfinite(mvsk).all()) + + # furthermore, pdf is continuous as b\to 0 + # rice.pdf(x, b\to 0) = x exp(-x^2/2) + O(b^2) + # see e.g. Abramovich & Stegun 9.6.7 & 9.6.10 + b = 1e-8 + assert_allclose(stats.rice.pdf(x, 0), stats.rice.pdf(x, b), + atol=b, rtol=0) + + def test_rice_rvs(self): + rvs = stats.rice.rvs + assert_equal(rvs(b=3.).size, 1) + assert_equal(rvs(b=3., size=(3, 5)).shape, (3, 5)) + + +class TestErlang(TestCase): + def test_erlang_runtimewarning(self): + # erlang should generate a RuntimeWarning if a non-integer + # shape parameter is used. + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + + # The non-integer shape parameter 1.3 should trigger a RuntimeWarning + assert_raises(RuntimeWarning, + stats.erlang.rvs, 1.3, loc=0, scale=1, size=4) + + # Calling the fit method with `f0` set to an integer should + # *not* trigger a RuntimeWarning. It should return the same + # values as gamma.fit(...). + data = [0.5, 1.0, 2.0, 4.0] + result_erlang = stats.erlang.fit(data, f0=1) + result_gamma = stats.gamma.fit(data, f0=1) + assert_allclose(result_erlang, result_gamma, rtol=1e-3) + + +class TestRdist(TestCase): + @dec.slow + def test_rdist_cdf_gh1285(self): + # check workaround in rdist._cdf for issue gh-1285. 
+ distfn = stats.rdist + values = [0.001, 0.5, 0.999] + assert_almost_equal(distfn.cdf(distfn.ppf(values, 541.0), 541.0), + values, decimal=5) +def test_540_567(): + # test for nan returned in tickets 540, 567 + assert_almost_equal(stats.norm.cdf(-1.7624320982),0.03899815971089126, + decimal=10, err_msg='test_540_567') + assert_almost_equal(stats.norm.cdf(-1.7624320983),0.038998159702449846, + decimal=10, err_msg='test_540_567') + assert_almost_equal(stats.norm.cdf(1.38629436112, loc=0.950273420309, + scale=0.204423758009),0.98353464004309321, + decimal=10, err_msg='test_540_567') def test_regression_ticket_1316(): - """Regression test for ticket #1316.""" # The following was raising an exception, because _construct_default_doc() # did not handle the default keyword extradoc=None. See ticket #1316. - g = stats.distributions.gamma_gen(name='gamma') + g = stats._continuous_distns.gamma_gen(name='gamma') def test_regression_ticket_1326(): - """Regression test for ticket #1326.""" - #adjust to avoid nan with 0*log(0) + # adjust to avoid nan with 0*log(0) assert_almost_equal(stats.chi2.pdf(0.0, 2), 0.5, 14) def test_regression_tukey_lambda(): - """ Make sure that Tukey-Lambda distribution correctly handles non-positive lambdas. - """ + # Make sure that Tukey-Lambda distribution correctly handles non-positive lambdas. x = np.linspace(-5.0, 5.0, 101) olderr = np.seterr(divide='ignore') @@ -835,11 +1374,492 @@ def test_regression_tukey_lambda(): assert_((p[2] == 0.0).any()) +@dec.skipif(DOCSTRINGS_STRIPPED) def test_regression_ticket_1421(): - """Regression test for ticket #1421 - correction discrete docs.""" assert_('pdf(x, mu, loc=0, scale=1)' not in stats.poisson.__doc__) assert_('pmf(x,' in stats.poisson.__doc__) +def test_nan_arguments_gh_issue_1362(): + assert_(np.isnan(stats.t.logcdf(1, np.nan))) + assert_(np.isnan(stats.t.cdf(1, np.nan))) + assert_(np.isnan(stats.t.logsf(1, np.nan))) + assert_(np.isnan(stats.t.sf(1, np.nan))) + assert_(np.isnan(stats.t.pdf(1, np.nan))) + assert_(np.isnan(stats.t.logpdf(1, np.nan))) + assert_(np.isnan(stats.t.ppf(1, np.nan))) + assert_(np.isnan(stats.t.isf(1, np.nan))) + + assert_(np.isnan(stats.bernoulli.logcdf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.cdf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.logsf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.sf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.pmf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.logpmf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.ppf(np.nan, 0.5))) + assert_(np.isnan(stats.bernoulli.isf(np.nan, 0.5))) + + +def test_frozen_fit_ticket_1536(): + np.random.seed(5678) + true = np.array([0.25, 0., 0.5]) + x = stats.lognorm.rvs(true[0], true[1], true[2], size=100) + + olderr = np.seterr(divide='ignore') + try: + params = np.array(stats.lognorm.fit(x, floc=0.)) + finally: + np.seterr(**olderr) + + assert_almost_equal(params, true, decimal=2) + + params = np.array(stats.lognorm.fit(x, fscale=0.5, loc=0)) + assert_almost_equal(params, true, decimal=2) + + params = np.array(stats.lognorm.fit(x, f0=0.25, loc=0)) + assert_almost_equal(params, true, decimal=2) + + params = np.array(stats.lognorm.fit(x, f0=0.25, floc=0)) + assert_almost_equal(params, true, decimal=2) + + np.random.seed(5678) + loc = 1 + floc = 0.9 + x = stats.norm.rvs(loc, 2., size=100) + params = np.array(stats.norm.fit(x, floc=floc)) + expected = np.array([floc, np.sqrt(((x-floc)**2).mean())]) + assert_almost_equal(params, expected, decimal=4) + + +def test_regression_ticket_1530(): + # Check the 
starting value works for Cauchy distribution fit. + np.random.seed(654321) + rvs = stats.cauchy.rvs(size=100) + params = stats.cauchy.fit(rvs) + expected = (0.045, 1.142) + assert_almost_equal(params, expected, decimal=1) + + +def test_tukeylambda_stats_ticket_1545(): + # Some test for the variance and kurtosis of the Tukey Lambda distr. + # See test_tukeylamdba_stats.py for more tests. + + mv = stats.tukeylambda.stats(0, moments='mvsk') + # Known exact values: + expected = [0, np.pi**2/3, 0, 1.2] + assert_almost_equal(mv, expected, decimal=10) + + mv = stats.tukeylambda.stats(3.13, moments='mvsk') + # 'expected' computed with mpmath. + expected = [0, 0.0269220858861465102, 0, -0.898062386219224104] + assert_almost_equal(mv, expected, decimal=10) + + mv = stats.tukeylambda.stats(0.14, moments='mvsk') + # 'expected' computed with mpmath. + expected = [0, 2.11029702221450250, 0, -0.02708377353223019456] + assert_almost_equal(mv, expected, decimal=10) + + +def test_poisson_logpmf_ticket_1436(): + assert_(np.isfinite(stats.poisson.logpmf(1500, 200))) + + +def test_powerlaw_stats(): + """Test the powerlaw stats function. + + This unit test is also a regression test for ticket 1548. + + The exact values are: + mean: + mu = a / (a + 1) + variance: + sigma**2 = a / ((a + 2) * (a + 1) ** 2) + skewness: + One formula (see http://en.wikipedia.org/wiki/Skewness) is + gamma_1 = (E[X**3] - 3*mu*E[X**2] + 2*mu**3) / sigma**3 + A short calculation shows that E[X**k] is a / (a + k), so gamma_1 + can be implemented as + n = a/(a+3) - 3*(a/(a+1))*a/(a+2) + 2*(a/(a+1))**3 + d = sqrt(a/((a+2)*(a+1)**2)) ** 3 + gamma_1 = n/d + Either by simplifying, or by a direct calculation of mu_3 / sigma**3, + one gets the more concise formula: + gamma_1 = -2.0 * ((a - 1) / (a + 3)) * sqrt((a + 2) / a) + kurtosis: (See http://en.wikipedia.org/wiki/Kurtosis) + The excess kurtosis is + gamma_2 = mu_4 / sigma**4 - 3 + A bit of calculus and algebra (sympy helps) shows that + mu_4 = 3*a*(3*a**2 - a + 2) / ((a+1)**4 * (a+2) * (a+3) * (a+4)) + so + gamma_2 = 3*(3*a**2 - a + 2) * (a+2) / (a*(a+3)*(a+4)) - 3 + which can be rearranged to + gamma_2 = 6 * (a**3 - a**2 - 6*a + 2) / (a*(a+3)*(a+4)) + """ + cases = [(1.0, (0.5, 1./12, 0.0, -1.2)), + (2.0, (2./3, 2./36, -0.56568542494924734, -0.6))] + for a, exact_mvsk in cases: + mvsk = stats.powerlaw.stats(a, moments="mvsk") + assert_array_almost_equal(mvsk, exact_mvsk) + + +def test_ksone_fit_freeze(): + # Regression test for ticket #1638. + d = np.array( + [-0.18879233, 0.15734249, 0.18695107, 0.27908787, -0.248649, + -0.2171497, 0.12233512, 0.15126419, 0.03119282, 0.4365294, + 0.08930393, -0.23509903, 0.28231224, -0.09974875, -0.25196048, + 0.11102028, 0.1427649, 0.10176452, 0.18754054, 0.25826724, + 0.05988819, 0.0531668, 0.21906056, 0.32106729, 0.2117662, + 0.10886442, 0.09375789, 0.24583286, -0.22968366, -0.07842391, + -0.31195432, -0.21271196, 0.1114243, -0.13293002, 0.01331725, + -0.04330977, -0.09485776, -0.28434547, 0.22245721, -0.18518199, + -0.10943985, -0.35243174, 0.06897665, -0.03553363, -0.0701746, + -0.06037974, 0.37670779, -0.21684405]) + + try: + olderr = np.seterr(invalid='ignore') + with warnings.catch_warnings(): + warnings.simplefilter('ignore', UserWarning) + warnings.simplefilter('ignore', RuntimeWarning) + stats.ksone.fit(d) + finally: + np.seterr(**olderr) + + +def test_norm_logcdf(): + # Test precision of the logcdf of the normal distribution. + # This precision was enhanced in ticket 1614. 
+ x = -np.asarray(list(range(0, 120, 4))) + # Values from R + expected = [-0.69314718, -10.36010149, -35.01343716, -75.41067300, + -131.69539607, -203.91715537, -292.09872100, -396.25241451, + -516.38564863, -652.50322759, -804.60844201, -972.70364403, + -1156.79057310, -1356.87055173, -1572.94460885, -1805.01356068, + -2053.07806561, -2317.13866238, -2597.19579746, -2893.24984493, + -3205.30112136, -3533.34989701, -3877.39640444, -4237.44084522, + -4613.48339520, -5005.52420869, -5413.56342187, -5837.60115548, + -6277.63751711, -6733.67260303] + + olderr = np.seterr(divide='ignore') + try: + assert_allclose(stats.norm().logcdf(x), expected, atol=1e-8) + finally: + np.seterr(**olderr) + + +def test_hypergeom_interval_1802(): + # these two had endless loops + assert_equal(stats.hypergeom.interval(.95, 187601, 43192, 757), + (152.0, 197.0)) + assert_equal(stats.hypergeom.interval(.945, 187601, 43192, 757), + (152.0, 197.0)) + # this was working also before + assert_equal(stats.hypergeom.interval(.94, 187601, 43192, 757), + (153.0, 196.0)) + + # degenerate case .a == .b + assert_equal(stats.hypergeom.ppf(0.02, 100, 100, 8), 8) + assert_equal(stats.hypergeom.ppf(1, 100, 100, 8), 8) + + +def test_distribution_too_many_args(): + # Check that a TypeError is raised when too many args are given to a method + # Regression test for ticket 1815. + x = np.linspace(0.1, 0.7, num=5) + assert_raises(TypeError, stats.gamma.pdf, x, 2, 3, loc=1.0) + assert_raises(TypeError, stats.gamma.pdf, x, 2, 3, 4, loc=1.0) + assert_raises(TypeError, stats.gamma.pdf, x, 2, 3, 4, 5) + assert_raises(TypeError, stats.gamma.pdf, x, 2, 3, loc=1.0, scale=0.5) + assert_raises(TypeError, stats.gamma.rvs, 2., 3, loc=1.0, scale=0.5) + assert_raises(TypeError, stats.gamma.cdf, x, 2., 3, loc=1.0, scale=0.5) + assert_raises(TypeError, stats.gamma.ppf, x, 2., 3, loc=1.0, scale=0.5) + assert_raises(TypeError, stats.gamma.stats, 2., 3, loc=1.0, scale=0.5) + assert_raises(TypeError, stats.gamma.entropy, 2., 3, loc=1.0, scale=0.5) + assert_raises(TypeError, stats.gamma.fit, x, 2., 3, loc=1.0, scale=0.5) + + # These should not give errors + stats.gamma.pdf(x, 2, 3) # loc=3 + stats.gamma.pdf(x, 2, 3, 4) # loc=3, scale=4 + stats.gamma.stats(2., 3) + stats.gamma.stats(2., 3, 4) + stats.gamma.stats(2., 3, 4, 'mv') + stats.gamma.rvs(2., 3, 4, 5) + stats.gamma.fit(stats.gamma.rvs(2., size=7), 2.) + + # Also for a discrete distribution + stats.geom.pmf(x, 2, loc=3) # no error, loc=3 + assert_raises(TypeError, stats.geom.pmf, x, 2, 3, 4) + assert_raises(TypeError, stats.geom.pmf, x, 2, 3, loc=4) + + # And for distributions with 0, 2 and 3 args respectively + assert_raises(TypeError, stats.expon.pdf, x, 3, loc=1.0) + assert_raises(TypeError, stats.exponweib.pdf, x, 3, 4, 5, loc=1.0) + assert_raises(TypeError, stats.exponweib.pdf, x, 3, 4, 5, 0.1, 0.1) + assert_raises(TypeError, stats.ncf.pdf, x, 3, 4, 5, 6, loc=1.0) + assert_raises(TypeError, stats.ncf.pdf, x, 3, 4, 5, 6, 1.0, scale=0.5) + stats.ncf.pdf(x, 3, 4, 5, 6, 1.0) # 3 args, plus loc/scale + + +def test_ncx2_tails_ticket_955(): + # Trac #955 -- check that the cdf computed by special functions + # matches the integrated pdf + a = stats.ncx2.cdf(np.arange(20, 25, 0.2), 2, 1.07458615e+02) + b = stats.ncx2.veccdf(np.arange(20, 25, 0.2), 2, 1.07458615e+02) + assert_allclose(a, b, rtol=1e-3, atol=0) + + +def test_foldnorm_zero(): + # Parameter value c=0 was not enabled, see gh-2399. 
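+    # With c=0 the folded normal reduces to the half-normal distribution.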
+ rv = stats.foldnorm(0, scale=1) + assert_equal(rv.cdf(0), 0) # rv.cdf(0) previously resulted in: nan + + +def test_stats_shapes_argcheck(): + # stats method was failing for vector shapes if some of the values + # were outside of the allowed range, see gh-2678 + mv3 = stats.invgamma.stats([0.0, 0.5, 1.0], 1, 0.5) # 0 is not a legal `a` + mv2 = stats.invgamma.stats([0.5, 1.0], 1, 0.5) + mv2_augmented = tuple(np.r_[np.nan, _] for _ in mv2) + assert_equal(mv2_augmented, mv3) + + mv3 = stats.lognorm.stats([2, 2.4, -1]) # -1 is not a legal shape parameter + mv2 = stats.lognorm.stats([2, 2.4]) + mv2_augmented = tuple(np.r_[_, np.nan] for _ in mv2) + assert_equal(mv2_augmented, mv3) + + # FIXME: this is only a quick-and-dirty test of a quick-and-dirty bugfix. + # stats method with multiple shape parameters is not properly vectorized + # anyway, so some distributions may or may not fail. + + +## Test subclassing distributions w/ explicit shapes + +class _distr_gen(stats.rv_continuous): + def _pdf(self, x, a): + return 42 + + +class _distr2_gen(stats.rv_continuous): + def _cdf(self, x, a): + return 42 * a + x + + +class _distr3_gen(stats.rv_continuous): + def _pdf(self, x, a, b): + return a + b + + def _cdf(self, x, a): + # Different # of shape params from _pdf, to be able to check that + # inspection catches the inconsistency.""" + return 42 * a + x + + +class _distr6_gen(stats.rv_continuous): + # Two shape parameters (both _pdf and _cdf defined, consistent shapes.) + def _pdf(self, x, a, b): + return a*x + b + + def _cdf(self, x, a, b): + return 42 * a + x + + +class TestSubclassingExplicitShapes(TestCase): + # Construct a distribution w/ explicit shapes parameter and test it. + + def test_correct_shapes(self): + dummy_distr = _distr_gen(name='dummy', shapes='a') + assert_equal(dummy_distr.pdf(1, a=1), 42) + + def test_wrong_shapes_1(self): + dummy_distr = _distr_gen(name='dummy', shapes='A') + assert_raises(TypeError, dummy_distr.pdf, 1, **dict(a=1)) + + def test_wrong_shapes_2(self): + dummy_distr = _distr_gen(name='dummy', shapes='a, b, c') + dct = dict(a=1, b=2, c=3) + assert_raises(TypeError, dummy_distr.pdf, 1, **dct) + + def test_shapes_string(self): + # shapes must be a string + dct = dict(name='dummy', shapes=42) + assert_raises(TypeError, _distr_gen, **dct) + + def test_shapes_identifiers_1(self): + # shapes must be a comma-separated list of valid python identifiers + dct = dict(name='dummy', shapes='(!)') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_identifiers_2(self): + dct = dict(name='dummy', shapes='4chan') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_identifiers_3(self): + dct = dict(name='dummy', shapes='m(fti)') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_identifiers_nodefaults(self): + dct = dict(name='dummy', shapes='a=2') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_args(self): + dct = dict(name='dummy', shapes='*args') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_kwargs(self): + dct = dict(name='dummy', shapes='**kwargs') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_keywords(self): + # python keywords cannot be used for shape parameters + dct = dict(name='dummy', shapes='a, b, c, lambda') + assert_raises(SyntaxError, _distr_gen, **dct) + + def test_shapes_signature(self): + # test explicit shapes which agree w/ the signature of _pdf + class _dist_gen(stats.rv_continuous): + def _pdf(self, x, a): + return stats.norm._pdf(x) * a + + dist 
= _dist_gen(shapes='a')
+        assert_equal(dist.pdf(0.5, a=2), stats.norm.pdf(0.5)*2)
+
+    def test_shapes_signature_inconsistent(self):
+        # test explicit shapes which do not agree w/ the signature of _pdf
+        class _dist_gen(stats.rv_continuous):
+            def _pdf(self, x, a):
+                return stats.norm._pdf(x) * a
+
+        dist = _dist_gen(shapes='a, b')
+        assert_raises(TypeError, dist.pdf, 0.5, **dict(a=1, b=2))
+
+    def test_star_args(self):
+        # test _pdf with only starargs
+        # NB: **kwargs of pdf will never reach _pdf
+        class _dist_gen(stats.rv_continuous):
+            def _pdf(self, x, *args):
+                extra_kwarg = args[0]
+                return stats.norm._pdf(x) * extra_kwarg
+
+        dist = _dist_gen(shapes='extra_kwarg')
+        assert_equal(dist.pdf(0.5, extra_kwarg=33), stats.norm.pdf(0.5)*33)
+        assert_equal(dist.pdf(0.5, 33), stats.norm.pdf(0.5)*33)
+        assert_raises(TypeError, dist.pdf, 0.5, **dict(xxx=33))
+
+    def test_star_args_2(self):
+        # test _pdf with named & starargs
+        # NB: **kwargs of pdf will never reach _pdf
+        class _dist_gen(stats.rv_continuous):
+            def _pdf(self, x, offset, *args):
+                extra_kwarg = args[0]
+                return stats.norm._pdf(x) * extra_kwarg + offset
+
+        dist = _dist_gen(shapes='offset, extra_kwarg')
+        assert_equal(dist.pdf(0.5, offset=111, extra_kwarg=33),
+                     stats.norm.pdf(0.5)*33 + 111)
+        assert_equal(dist.pdf(0.5, 111, 33),
+                     stats.norm.pdf(0.5)*33 + 111)
+
+    def test_extra_kwarg(self):
+        # **kwargs to _pdf are ignored.
+        # this is a limitation of the framework (_pdf(x, *goodargs))
+        class _distr_gen(stats.rv_continuous):
+            def _pdf(self, x, *args, **kwargs):
+                # _pdf should handle *args, **kwargs itself. Here "handling" is
+                # ignoring *args and looking for ``extra_kwarg`` and using that.
+                extra_kwarg = kwargs.pop('extra_kwarg', 1)
+                return stats.norm._pdf(x) * extra_kwarg
+
+        dist = _distr_gen(shapes='extra_kwarg')
+        assert_equal(dist.pdf(1, extra_kwarg=3), stats.norm.pdf(1))
+
+    def test_shapes_empty_string(self):
+        # shapes='' is equivalent to shapes=None
+        class _dist_gen(stats.rv_continuous):
+            def _pdf(self, x):
+                return stats.norm.pdf(x)
+
+        dist = _dist_gen(shapes='')
+        assert_equal(dist.pdf(0.5), stats.norm.pdf(0.5))
+
+
+class TestSubclassingNoShapes(TestCase):
+    # Construct a distribution w/o explicit shapes parameter and test it.
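+    # Shape parameters are then inferred by inspecting the _pdf/_cdf signatures.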
+ + def test_only__pdf(self): + dummy_distr = _distr_gen(name='dummy') + assert_equal(dummy_distr.pdf(1, a=1), 42) + + def test_only__cdf(self): + # _pdf is determined from _cdf by taking numerical derivative + dummy_distr = _distr2_gen(name='dummy') + assert_almost_equal(dummy_distr.pdf(1, a=1), 1) + + @dec.skipif(DOCSTRINGS_STRIPPED) + def test_signature_inspection(self): + # check that _pdf signature inspection works correctly, and is used in + # the class docstring + dummy_distr = _distr_gen(name='dummy') + assert_equal(dummy_distr.numargs, 1) + assert_equal(dummy_distr.shapes, 'a') + res = re.findall('logpdf\(x, a, loc=0, scale=1\)', + dummy_distr.__doc__) + assert_(len(res) == 1) + + @dec.skipif(DOCSTRINGS_STRIPPED) + def test_signature_inspection_2args(self): + # same for 2 shape params and both _pdf and _cdf defined + dummy_distr = _distr6_gen(name='dummy') + assert_equal(dummy_distr.numargs, 2) + assert_equal(dummy_distr.shapes, 'a, b') + res = re.findall('logpdf\(x, a, b, loc=0, scale=1\)', + dummy_distr.__doc__) + assert_(len(res) == 1) + + def test_signature_inspection_2args_incorrect_shapes(self): + # both _pdf and _cdf defined, but shapes are inconsistent: raises + try: + _distr3_gen(name='dummy') + except TypeError: + pass + else: + raise AssertionError('TypeError not raised.') + + def test_defaults_raise(self): + # default arguments should raise + class _dist_gen(stats.rv_continuous): + def _pdf(self, x, a=42): + return 42 + assert_raises(TypeError, _dist_gen, **dict(name='dummy')) + + def test_starargs_raise(self): + # without explicit shapes, *args are not allowed + class _dist_gen(stats.rv_continuous): + def _pdf(self, x, a, *args): + return 42 + assert_raises(TypeError, _dist_gen, **dict(name='dummy')) + + def test_kwargs_raise(self): + # without explicit shapes, **kwargs are not allowed + class _dist_gen(stats.rv_continuous): + def _pdf(self, x, a, **kwargs): + return 42 + assert_raises(TypeError, _dist_gen, **dict(name='dummy')) + + +@dec.skipif(DOCSTRINGS_STRIPPED) +def test_docstrings(): + badones = [',\s*,', '\(\s*,', '^\s*:'] + for distname in stats.__all__: + dist = getattr(stats, distname) + if isinstance(dist, (stats.rv_discrete, stats.rv_continuous)): + for regex in badones: + assert_(re.search(regex, dist.__doc__) is None) + + +def test_infinite_input(): + assert_almost_equal(stats.skellam.sf(np.inf, 10, 11), 0) + assert_almost_equal(stats.ncx2._cdf(np.inf, 8, 0.1), 1) + + if __name__ == "__main__": run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_fit.py b/pywafo/src/wafo/stats/tests/test_fit.py index 68d5793..be81d2c 100644 --- a/pywafo/src/wafo/stats/tests/test_fit.py +++ b/pywafo/src/wafo/stats/tests/test_fit.py @@ -1,98 +1,100 @@ -# NOTE: contains only one test, _est_cont_fit, that is renamed so that -# nose doesn't run it -# I put this here for the record and for the case when someone wants to -# verify the quality of fit -# with current parameters: relatively small sample size, default starting values -# Ran 84 tests in 401.797s -# FAILED (failures=15) -# -#Ran 83 tests in 238.859s -#FAILED (failures=12) - from __future__ import division, print_function, absolute_import -import numpy.testing as npt +import os + import numpy as np +from numpy.testing import dec from wafo import stats -from test_continuous_basic import distcont +from .test_continuous_basic import distcont # this is not a proper statistical test for convergence, but only # verifies that the estimate and true values don't differ by too much -n_repl1 = 1000 # sample size for 
first run -n_repl2 = 5000 # sample size for second run, if first run fails -thresh_percent = 0.25 # percent of true parameters for fail cut-off + +fit_sizes = [1000, 5000] # sample sizes to try +thresh_percent = 0.25 # percent of true parameters for fail cut-off thresh_min = 0.75 # minimum difference estimate - true to fail test +failing_fits = [ + 'burr', + 'chi', + 'chi2', + 'gausshyper', + 'genexpon', + 'gengamma', + 'ksone', + 'mielke', + 'ncf', + 'ncx2', + 'pearson3', + 'powerlognorm', + 'truncexpon', + 'tukeylambda', + 'vonmises', + 'wrapcauchy', +] -distslow = [ 'ncx2', 'rdist', 'gausshyper', 'recipinvgauss', 'ksone', 'genexpon', - 'vonmises', 'rice', 'mielke', - 'powerlognorm', 'kstwobign', 'tukeylambda','betaprime', 'gengamma', - 'johnsonsb', 'burr', 'truncexpon', 'pearson3', 'exponweib', 'nakagami', - 'wrapcauchy'] -dist_rarely_fitted = ['f', 'ncf', 'nct', 'chi'] -distskip = distslow + dist_rarely_fitted +# Don't run the fit test on these: +skip_fit = [ + 'erlang', # Subclass of gamma, generates a warning. +] -#distcont = [['genextreme', (3.3184017469423535,)]] -#@npt.dec.slow -def test_cont_fit(): - # this tests the closeness of the estimated parameters to the true - # parameters with fit method of continuous distributions - for distname, arg in distcont: - if distname not in distskip: - yield check_cont_fit, distname,arg -@npt.dec.slow -def _est_cont_fit_slow(): +@dec.slow +def test_cont_fit(): # this tests the closeness of the estimated parameters to the true # parameters with fit method of continuous distributions # Note: is slow, some distributions don't converge with sample size <= 10000 + for distname, arg in distcont: - if distname in distslow: + if distname not in skip_fit: yield check_cont_fit, distname,arg -def test_lognorm_fit_ticket1131(): - params = [(2.1, 1.,1.), (1.,10.,1.), (1.,1.,10.)] - for param in params: - yield check_cont_fit, 'lognorm', param -def check_cont_fit(distname,arg): +def check_cont_fit(distname,arg): + if distname in failing_fits: + # Skip failing fits unless overridden + xfail = True + try: + xfail = not int(os.environ['SCIPY_XFAIL']) + except: + pass + if xfail: + msg = "Fitting %s doesn't work reliably yet" % distname + msg += " [Set environment variable SCIPY_XFAIL=1 to run this test nevertheless.]" + dec.knownfailureif(True, msg)(lambda: None)() + distfn = getattr(stats, distname) - rvs = distfn.rvs(size=n_repl1,*arg) - est = distfn.fit(rvs) #, *arg) # start with default values - n = distfn.numargs + 2 - truearg = np.hstack([arg,[0.0, 1.0]])[:n] - - diff = est-truearg - - txt = '' + + truearg = np.hstack([arg,[0.0,1.0]]) diffthreshold = np.max(np.vstack([truearg*thresh_percent, - np.ones(distfn.numargs+2)*thresh_min]),0) - # threshold for location - diffthreshold[-2] = np.max([np.abs(rvs.mean())*thresh_percent,thresh_min]) - - if np.any(np.isnan(est)): - raise AssertionError('nan returned in fit') + np.ones(distfn.numargs+2)*thresh_min]),0) + + for fit_size in fit_sizes: + # Note that if a fit succeeds, the other fit_sizes are skipped + np.random.seed(1234) + + with np.errstate(all='ignore'): + rvs = distfn.rvs(size=fit_size, *arg) + est = distfn.fit(rvs) # start with default values + + diff = est - truearg + + # threshold for location + diffthreshold[-2] = np.max([np.abs(rvs.mean())*thresh_percent,thresh_min]) + + if np.any(np.isnan(est)): + raise AssertionError('nan returned in fit') + else: + if np.all(np.abs(diff) <= diffthreshold): + break else: - if np.any((np.abs(diff) - diffthreshold) > 0.0): -## txt = 'WARNING - diff too large with 
small sample' -## print 'parameter diff =', diff - diffthreshold, txt - rvs = np.concatenate([rvs,distfn.rvs(size=n_repl2-n_repl1,*arg)]) - est = distfn.fit(rvs) #,*arg) - truearg = np.hstack([arg,[0.0,1.0]])[:n] - diff = est-truearg - if np.any((np.abs(diff) - diffthreshold) > 0.0): - txt = 'parameter: %s\n' % str(truearg) - txt += 'estimated: %s\n' % str(est) - txt += 'diff : %s\n' % str(diff) - raise AssertionError('fit not very good in %s\n' % distfn.name + txt) - - -if __name__ == "__main__": - check_cont_fit('bradford', (0.29891359763170633,)) -# check_cont_fit('lognorm', (10,1,1)) -# check_cont_fit('ncx2', (21, 1.0560465975116415)) - import nose - #nose.run(argv=['', __file__]) - nose.runmodule(argv=[__file__,'-s'], exit=False) + txt = 'parameter: %s\n' % str(truearg) + txt += 'estimated: %s\n' % str(est) + txt += 'diff : %s\n' % str(diff) + raise AssertionError('fit not very good in %s\n' % distfn.name + txt) + + +if __name__ == "__main__": + np.testing.run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_kdeoth.py b/pywafo/src/wafo/stats/tests/test_kdeoth.py index ee822b7..198f24c 100644 --- a/pywafo/src/wafo/stats/tests/test_kdeoth.py +++ b/pywafo/src/wafo/stats/tests/test_kdeoth.py @@ -1,9 +1,10 @@ - - +from __future__ import division, print_function, absolute_import from wafo import stats import numpy as np -from numpy.testing import assert_almost_equal, assert_ +from numpy.testing import assert_almost_equal, assert_, assert_raises, \ + assert_array_almost_equal, assert_array_almost_equal_nulp, run_module_suite + def test_kde_1d(): #some basic tests comparing to normal distribution @@ -15,13 +16,13 @@ def test_kde_1d(): # get kde for original sample gkde = stats.gaussian_kde(xn) - - # evaluate the density funtion for the kde for some points + + # evaluate the density function for the kde for some points xs = np.linspace(-7,7,501) kdepdf = gkde.evaluate(xs) normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd) intervall = xs[1] - xs[0] - + assert_(np.sum((kdepdf - normpdf)**2)*intervall < 0.01) prob1 = gkde.integrate_box_1d(xnmean, np.inf) prob2 = gkde.integrate_box_1d(-np.inf, xnmean) @@ -29,8 +30,155 @@ def test_kde_1d(): assert_almost_equal(prob2, 0.5, decimal=1) assert_almost_equal(gkde.integrate_box(xnmean, np.inf), prob1, decimal=13) assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), prob2, decimal=13) - + assert_almost_equal(gkde.integrate_kde(gkde), (kdepdf**2).sum()*intervall, decimal=2) assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2), - (kdepdf*normpdf).sum()*intervall, decimal=2) + (kdepdf*normpdf).sum()*intervall, decimal=2) + + +def test_kde_bandwidth_method(): + def scotts_factor(kde_obj): + """Same as default, just check that it works.""" + return np.power(kde_obj.n, -1./(kde_obj.d+4)) + + np.random.seed(8765678) + n_basesample = 50 + xn = np.random.randn(n_basesample) + + # Default + gkde = stats.gaussian_kde(xn) + # Supply a callable + gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor) + # Supply a scalar + gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor) + + xs = np.linspace(-7,7,51) + kdepdf = gkde.evaluate(xs) + kdepdf2 = gkde2.evaluate(xs) + assert_almost_equal(kdepdf, kdepdf2) + kdepdf3 = gkde3.evaluate(xs) + assert_almost_equal(kdepdf, kdepdf3) + + assert_raises(ValueError, stats.gaussian_kde, xn, bw_method='wrongstring') + + +# Subclasses that should stay working (extracted from various sources). 
+# Unfortunately the earlier design of gaussian_kde made it necessary for users +# to create these kinds of subclasses, or call _compute_covariance() directly. + +class _kde_subclass1(stats.gaussian_kde): + def __init__(self, dataset): + self.dataset = np.atleast_2d(dataset) + self.d, self.n = self.dataset.shape + self.covariance_factor = self.scotts_factor + self._compute_covariance() + + +class _kde_subclass2(stats.gaussian_kde): + def __init__(self, dataset): + self.covariance_factor = self.scotts_factor + super(_kde_subclass2, self).__init__(dataset) + + +class _kde_subclass3(stats.gaussian_kde): + def __init__(self, dataset, covariance): + self.covariance = covariance + stats.gaussian_kde.__init__(self, dataset) + + def _compute_covariance(self): + self.inv_cov = np.linalg.inv(self.covariance) + self._norm_factor = np.sqrt(np.linalg.det(2*np.pi * self.covariance)) \ + * self.n + + +class _kde_subclass4(stats.gaussian_kde): + def covariance_factor(self): + return 0.5 * self.silverman_factor() + + +def test_gaussian_kde_subclassing(): + x1 = np.array([-7, -5, 1, 4, 5], dtype=np.float) + xs = np.linspace(-10, 10, num=50) + + # gaussian_kde itself + kde = stats.gaussian_kde(x1) + ys = kde(xs) + + # subclass 1 + kde1 = _kde_subclass1(x1) + y1 = kde1(xs) + assert_array_almost_equal_nulp(ys, y1, nulp=10) + + # subclass 2 + kde2 = _kde_subclass2(x1) + y2 = kde2(xs) + assert_array_almost_equal_nulp(ys, y2, nulp=10) + + # subclass 3 + kde3 = _kde_subclass3(x1, kde.covariance) + y3 = kde3(xs) + assert_array_almost_equal_nulp(ys, y3, nulp=10) + + # subclass 4 + kde4 = _kde_subclass4(x1) + y4 = kde4(x1) + y_expected = [0.06292987, 0.06346938, 0.05860291, 0.08657652, 0.07904017] + + assert_array_almost_equal(y_expected, y4, decimal=6) + + # Not a subclass, but check for use of _compute_covariance() + kde5 = kde + kde5.covariance_factor = lambda: kde.factor + kde5._compute_covariance() + y5 = kde5(xs) + assert_array_almost_equal_nulp(ys, y5, nulp=10) + + +def test_gaussian_kde_covariance_caching(): + x1 = np.array([-7, -5, 1, 4, 5], dtype=np.float) + xs = np.linspace(-10, 10, num=5) + # These expected values are from scipy 0.10, before some changes to + # gaussian_kde. They were not compared with any external reference. + y_expected = [0.02463386, 0.04689208, 0.05395444, 0.05337754, 0.01664475] + + # Set the bandwidth, then reset it to the default. + kde = stats.gaussian_kde(x1) + kde.set_bandwidth(bw_method=0.5) + kde.set_bandwidth(bw_method='scott') + y2 = kde(xs) + + assert_array_almost_equal(y_expected, y2, decimal=7) + + +def test_gaussian_kde_monkeypatch(): + """Ugly, but people may rely on this. See scipy pull request 123, + specifically the linked ML thread "Width of the Gaussian in stats.kde". + If it is necessary to break this later on, that is to be discussed on ML. + """ + x1 = np.array([-7, -5, 1, 4, 5], dtype=np.float) + xs = np.linspace(-10, 10, num=50) + + # The old monkeypatched version to get at Silverman's Rule. + kde = stats.gaussian_kde(x1) + kde.covariance_factor = kde.silverman_factor + kde._compute_covariance() + y1 = kde(xs) + + # The new saner version. 
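+    # Passing bw_method='silverman' should reproduce the monkeypatched result above.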
+ kde2 = stats.gaussian_kde(x1, bw_method='silverman') + y2 = kde2(xs) + + assert_array_almost_equal_nulp(y1, y2, nulp=10) + + +def test_kde_integer_input(): + """Regression test for #1181.""" + x1 = np.arange(5) + kde = stats.gaussian_kde(x1) + y_expected = [0.13480721, 0.18222869, 0.19514935, 0.18222869, 0.13480721] + assert_array_almost_equal(kde(x1), y_expected, decimal=6) + + +if __name__ == "__main__": + run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_morestats.py b/pywafo/src/wafo/stats/tests/test_morestats.py new file mode 100644 index 0000000..88f00b9 --- /dev/null +++ b/pywafo/src/wafo/stats/tests/test_morestats.py @@ -0,0 +1,789 @@ +# Author: Travis Oliphant, 2002 +# +# Further enhancements and tests added by numerous SciPy developers. +# +from __future__ import division, print_function, absolute_import + +import warnings + +import numpy as np +from numpy.random import RandomState +from numpy.testing import (TestCase, run_module_suite, assert_array_equal, + assert_almost_equal, assert_array_less, assert_array_almost_equal, + assert_raises, assert_, assert_allclose, assert_equal, dec) + +from scipy import stats + +# Matplotlib is not a scipy dependency but is optionally used in probplot, so +# check if it's available +try: + import matplotlib.pyplot as plt + have_matplotlib = True +except: + have_matplotlib = False + + +g1 = [1.006, 0.996, 0.998, 1.000, 0.992, 0.993, 1.002, 0.999, 0.994, 1.000] +g2 = [0.998, 1.006, 1.000, 1.002, 0.997, 0.998, 0.996, 1.000, 1.006, 0.988] +g3 = [0.991, 0.987, 0.997, 0.999, 0.995, 0.994, 1.000, 0.999, 0.996, 0.996] +g4 = [1.005, 1.002, 0.994, 1.000, 0.995, 0.994, 0.998, 0.996, 1.002, 0.996] +g5 = [0.998, 0.998, 0.982, 0.990, 1.002, 0.984, 0.996, 0.993, 0.980, 0.996] +g6 = [1.009, 1.013, 1.009, 0.997, 0.988, 1.002, 0.995, 0.998, 0.981, 0.996] +g7 = [0.990, 1.004, 0.996, 1.001, 0.998, 1.000, 1.018, 1.010, 0.996, 1.002] +g8 = [0.998, 1.000, 1.006, 1.000, 1.002, 0.996, 0.998, 0.996, 1.002, 1.006] +g9 = [1.002, 0.998, 0.996, 0.995, 0.996, 1.004, 1.004, 0.998, 0.999, 0.991] +g10 = [0.991, 0.995, 0.984, 0.994, 0.997, 0.997, 0.991, 0.998, 1.004, 0.997] + + +class TestShapiro(TestCase): + def test_basic(self): + x1 = [0.11,7.87,4.61,10.14,7.95,3.14,0.46, + 4.43,0.21,4.75,0.71,1.52,3.24, + 0.93,0.42,4.97,9.53,4.55,0.47,6.66] + w,pw = stats.shapiro(x1) + assert_almost_equal(w,0.90047299861907959,6) + assert_almost_equal(pw,0.042089745402336121,6) + x2 = [1.36,1.14,2.92,2.55,1.46,1.06,5.27,-1.11, + 3.48,1.10,0.88,-0.51,1.46,0.52,6.20,1.69, + 0.08,3.67,2.81,3.49] + w,pw = stats.shapiro(x2) + assert_almost_equal(w,0.9590270,6) + assert_almost_equal(pw,0.52460,3) + + def test_bad_arg(self): + # Length of x is less than 3. 
+ x = [1] + assert_raises(ValueError, stats.shapiro, x) + + +class TestAnderson(TestCase): + def test_normal(self): + rs = RandomState(1234567890) + x1 = rs.standard_exponential(size=50) + x2 = rs.standard_normal(size=50) + A,crit,sig = stats.anderson(x1) + assert_array_less(crit[:-1], A) + A,crit,sig = stats.anderson(x2) + assert_array_less(A, crit[-2:]) + + def test_expon(self): + rs = RandomState(1234567890) + x1 = rs.standard_exponential(size=50) + x2 = rs.standard_normal(size=50) + A,crit,sig = stats.anderson(x1,'expon') + assert_array_less(A, crit[-2:]) + olderr = np.seterr(all='ignore') + try: + A,crit,sig = stats.anderson(x2,'expon') + finally: + np.seterr(**olderr) + assert_(A > crit[-1]) + + def test_bad_arg(self): + assert_raises(ValueError, stats.anderson, [1], dist='plate_of_shrimp') + + +class TestAnsari(TestCase): + + def test_small(self): + x = [1,2,3,3,4] + y = [3,2,6,1,6,1,4,1] + W, pval = stats.ansari(x,y) + assert_almost_equal(W,23.5,11) + assert_almost_equal(pval,0.13499256881897437,11) + + def test_approx(self): + ramsay = np.array((111, 107, 100, 99, 102, 106, 109, 108, 104, 99, + 101, 96, 97, 102, 107, 113, 116, 113, 110, 98)) + parekh = np.array((107, 108, 106, 98, 105, 103, 110, 105, 104, + 100, 96, 108, 103, 104, 114, 114, 113, 108, 106, 99)) + + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', + message="Ties preclude use of exact statistic.") + W, pval = stats.ansari(ramsay, parekh) + + assert_almost_equal(W,185.5,11) + assert_almost_equal(pval,0.18145819972867083,11) + + def test_exact(self): + W,pval = stats.ansari([1,2,3,4],[15,5,20,8,10,12]) + assert_almost_equal(W,10.0,11) + assert_almost_equal(pval,0.533333333333333333,7) + + def test_bad_arg(self): + assert_raises(ValueError, stats.ansari, [], [1]) + assert_raises(ValueError, stats.ansari, [1], []) + + +class TestBartlett(TestCase): + + def test_data(self): + args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10] + T, pval = stats.bartlett(*args) + assert_almost_equal(T,20.78587342806484,7) + assert_almost_equal(pval,0.0136358632781,7) + + def test_bad_arg(self): + # Too few args raises ValueError. + assert_raises(ValueError, stats.bartlett, [1]) + + +class TestLevene(TestCase): + + def test_data(self): + args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10] + W, pval = stats.levene(*args) + assert_almost_equal(W,1.7059176930008939,7) + assert_almost_equal(pval,0.0990829755522,7) + + def test_trimmed1(self): + # Test that center='trimmed' gives the same result as center='mean' + # when proportiontocut=0. + W1, pval1 = stats.levene(g1, g2, g3, center='mean') + W2, pval2 = stats.levene(g1, g2, g3, center='trimmed', proportiontocut=0.0) + assert_almost_equal(W1, W2) + assert_almost_equal(pval1, pval2) + + def test_trimmed2(self): + x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0] + y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0] + np.random.seed(1234) + x2 = np.random.permutation(x) + + # Use center='trimmed' + W0, pval0 = stats.levene(x, y, center='trimmed', proportiontocut=0.125) + W1, pval1 = stats.levene(x2, y, center='trimmed', proportiontocut=0.125) + # Trim the data here, and use center='mean' + W2, pval2 = stats.levene(x[1:-1], y[1:-1], center='mean') + # Result should be the same. 
+ assert_almost_equal(W0, W2) + assert_almost_equal(W1, W2) + assert_almost_equal(pval1, pval2) + + def test_equal_mean_median(self): + x = np.linspace(-1,1,21) + np.random.seed(1234) + x2 = np.random.permutation(x) + y = x**3 + W1, pval1 = stats.levene(x, y, center='mean') + W2, pval2 = stats.levene(x2, y, center='median') + assert_almost_equal(W1, W2) + assert_almost_equal(pval1, pval2) + + def test_bad_keyword(self): + x = np.linspace(-1,1,21) + assert_raises(TypeError, stats.levene, x, x, portiontocut=0.1) + + def test_bad_center_value(self): + x = np.linspace(-1,1,21) + assert_raises(ValueError, stats.levene, x, x, center='trim') + + def test_too_few_args(self): + assert_raises(ValueError, stats.levene, [1]) + + +class TestBinomP(TestCase): + + def test_data(self): + pval = stats.binom_test(100,250) + assert_almost_equal(pval,0.0018833009350757682,11) + pval = stats.binom_test(201,405) + assert_almost_equal(pval,0.92085205962670713,11) + pval = stats.binom_test([682,243],p=3.0/4) + assert_almost_equal(pval,0.38249155957481695,11) + + def test_bad_len_x(self): + # Length of x must be 1 or 2. + assert_raises(ValueError, stats.binom_test, [1,2,3]) + + def test_bad_n(self): + # len(x) is 1, but n is invalid. + # Missing n + assert_raises(ValueError, stats.binom_test, [100]) + # n less than x[0] + assert_raises(ValueError, stats.binom_test, [100], n=50) + + def test_bad_p(self): + assert_raises(ValueError, stats.binom_test, [50, 50], p=2.0) + + +class TestFindRepeats(TestCase): + + def test_basic(self): + a = [1,2,3,4,1,2,3,4,1,2,5] + res,nums = stats.find_repeats(a) + assert_array_equal(res,[1,2,3,4]) + assert_array_equal(nums,[3,3,2,2]) + + def test_empty_result(self): + # Check that empty arrays are returned when there are no repeats. + a = [10, 20, 50, 30, 40] + repeated, counts = stats.find_repeats(a) + assert_array_equal(repeated, []) + assert_array_equal(counts, []) + + +class TestFligner(TestCase): + + def test_data(self): + # numbers from R: fligner.test in package stats + x1 = np.arange(5) + assert_array_almost_equal(stats.fligner(x1,x1**2), + (3.2282229927203536, 0.072379187848207877), 11) + + def test_trimmed1(self): + # Test that center='trimmed' gives the same result as center='mean' + # when proportiontocut=0. + Xsq1, pval1 = stats.fligner(g1, g2, g3, center='mean') + Xsq2, pval2 = stats.fligner(g1, g2, g3, center='trimmed', proportiontocut=0.0) + assert_almost_equal(Xsq1, Xsq2) + assert_almost_equal(pval1, pval2) + + def test_trimmed2(self): + x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0] + y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0] + # Use center='trimmed' + Xsq1, pval1 = stats.fligner(x, y, center='trimmed', proportiontocut=0.125) + # Trim the data here, and use center='mean' + Xsq2, pval2 = stats.fligner(x[1:-1], y[1:-1], center='mean') + # Result should be the same. + assert_almost_equal(Xsq1, Xsq2) + assert_almost_equal(pval1, pval2) + + # The following test looks reasonable at first, but fligner() uses the + # function stats.rankdata(), and in one of the cases in this test, + # there are ties, while in the other (because of normal rounding + # errors) there are not. This difference leads to differences in the + # third significant digit of W. 
+ # + #def test_equal_mean_median(self): + # x = np.linspace(-1,1,21) + # y = x**3 + # W1, pval1 = stats.fligner(x, y, center='mean') + # W2, pval2 = stats.fligner(x, y, center='median') + # assert_almost_equal(W1, W2) + # assert_almost_equal(pval1, pval2) + + def test_bad_keyword(self): + x = np.linspace(-1,1,21) + assert_raises(TypeError, stats.fligner, x, x, portiontocut=0.1) + + def test_bad_center_value(self): + x = np.linspace(-1,1,21) + assert_raises(ValueError, stats.fligner, x, x, center='trim') + + def test_bad_num_args(self): + # Too few args raises ValueError. + assert_raises(ValueError, stats.fligner, [1]) + + +class TestMood(TestCase): + def test_mood(self): + # numbers from R: mood.test in package stats + x1 = np.arange(5) + assert_array_almost_equal(stats.mood(x1, x1**2), + (-1.3830857299399906, 0.16663858066771478), 11) + + def test_mood_order_of_args(self): + # z should change sign when the order of arguments changes, pvalue + # should not change + np.random.seed(1234) + x1 = np.random.randn(10, 1) + x2 = np.random.randn(15, 1) + z1, p1 = stats.mood(x1, x2) + z2, p2 = stats.mood(x2, x1) + assert_array_almost_equal([z1, p1], [-z2, p2]) + + def test_mood_with_axis_none(self): + #Test with axis = None, compare with results from R + x1 = [-0.626453810742332, 0.183643324222082, -0.835628612410047, + 1.59528080213779, 0.329507771815361, -0.820468384118015, + 0.487429052428485, 0.738324705129217, 0.575781351653492, + -0.305388387156356, 1.51178116845085, 0.389843236411431, + -0.621240580541804, -2.2146998871775, 1.12493091814311, + -0.0449336090152309, -0.0161902630989461, 0.943836210685299, + 0.821221195098089, 0.593901321217509] + + x2 = [-0.896914546624981, 0.184849184646742, 1.58784533120882, + -1.13037567424629, -0.0802517565509893, 0.132420284381094, + 0.707954729271733, -0.23969802417184, 1.98447393665293, + -0.138787012119665, 0.417650750792556, 0.981752777463662, + -0.392695355503813, -1.03966897694891, 1.78222896030858, + -2.31106908460517, 0.878604580921265, 0.035806718015226, + 1.01282869212708, 0.432265154539617, 2.09081920524915, + -1.19992581964387, 1.58963820029007, 1.95465164222325, + 0.00493777682814261, -2.45170638784613, 0.477237302613617, + -0.596558168631403, 0.792203270299649, 0.289636710177348] + + x1 = np.array(x1) + x2 = np.array(x2) + x1.shape = (10, 2) + x2.shape = (15, 2) + assert_array_almost_equal(stats.mood(x1, x2, axis=None), + [-1.31716607555, 0.18778296257]) + + def test_mood_2d(self): + # Test if the results of mood test in 2-D case are consistent with the + # R result for the same inputs. Numbers from R mood.test(). 
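+        # Each of the ny columns is compared against the corresponding 1-D mood test.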
+ ny = 5 + np.random.seed(1234) + x1 = np.random.randn(10, ny) + x2 = np.random.randn(15, ny) + z_vectest, pval_vectest = stats.mood(x1, x2) + + for j in range(ny): + assert_array_almost_equal([z_vectest[j], pval_vectest[j]], + stats.mood(x1[:, j], x2[:, j])) + + # inverse order of dimensions + x1 = x1.transpose() + x2 = x2.transpose() + z_vectest, pval_vectest = stats.mood(x1, x2, axis=1) + + for i in range(ny): + # check axis handling is self consistent + assert_array_almost_equal([z_vectest[i], pval_vectest[i]], + stats.mood(x1[i, :], x2[i, :])) + + def test_mood_3d(self): + shape = (10, 5, 6) + np.random.seed(1234) + x1 = np.random.randn(*shape) + x2 = np.random.randn(*shape) + + for axis in range(3): + z_vectest, pval_vectest = stats.mood(x1, x2, axis=axis) + # Tests that result for 3-D arrays is equal to that for the + # same calculation on a set of 1-D arrays taken from the + # 3-D array + axes_idx = ([1, 2], [0, 2], [0, 1]) # the two axes != axis + for i in range(shape[axes_idx[axis][0]]): + for j in range(shape[axes_idx[axis][1]]): + if axis == 0: + slice1 = x1[:, i, j] + slice2 = x2[:, i, j] + elif axis == 1: + slice1 = x1[i, :, j] + slice2 = x2[i, :, j] + else: + slice1 = x1[i, j, :] + slice2 = x2[i, j, :] + + assert_array_almost_equal([z_vectest[i, j], + pval_vectest[i, j]], + stats.mood(slice1, slice2)) + + def test_mood_bad_arg(self): + # Raise ValueError when the sum of the lengths of the args is less than 3 + assert_raises(ValueError, stats.mood, [1], []) + + +class TestProbplot(TestCase): + + def test_basic(self): + np.random.seed(12345) + x = stats.norm.rvs(size=20) + osm, osr = stats.probplot(x, fit=False) + osm_expected = [-1.8241636, -1.38768012, -1.11829229, -0.91222575, + -0.73908135, -0.5857176, -0.44506467, -0.31273668, + -0.18568928, -0.06158146, 0.06158146, 0.18568928, + 0.31273668, 0.44506467, 0.5857176, 0.73908135, + 0.91222575, 1.11829229, 1.38768012, 1.8241636] + assert_allclose(osr, np.sort(x)) + assert_allclose(osm, osm_expected) + + res, res_fit = stats.probplot(x, fit=True) + res_fit_expected = [1.05361841, 0.31297795, 0.98741609] + assert_allclose(res_fit, res_fit_expected) + + def test_sparams_keyword(self): + np.random.seed(123456) + x = stats.norm.rvs(size=100) + # Check that None, () and 0 (loc=0, for normal distribution) all work + # and give the same results + osm1, osr1 = stats.probplot(x, sparams=None, fit=False) + osm2, osr2 = stats.probplot(x, sparams=0, fit=False) + osm3, osr3 = stats.probplot(x, sparams=(), fit=False) + assert_allclose(osm1, osm2) + assert_allclose(osm1, osm3) + assert_allclose(osr1, osr2) + assert_allclose(osr1, osr3) + # Check giving (loc, scale) params for normal distribution + osm, osr = stats.probplot(x, sparams=(), fit=False) + + def test_dist_keyword(self): + np.random.seed(12345) + x = stats.norm.rvs(size=20) + osm1, osr1 = stats.probplot(x, fit=False, dist='t', sparams=(3,)) + osm2, osr2 = stats.probplot(x, fit=False, dist=stats.t, sparams=(3,)) + assert_allclose(osm1, osm2) + assert_allclose(osr1, osr2) + + assert_raises(ValueError, stats.probplot, x, dist='wrong-dist-name') + assert_raises(AttributeError, stats.probplot, x, dist=[]) + + class custom_dist(object): + """Some class that looks just enough like a distribution.""" + def ppf(self, q): + return stats.norm.ppf(q, loc=2) + + osm1, osr1 = stats.probplot(x, sparams=(2,), fit=False) + osm2, osr2 = stats.probplot(x, dist=custom_dist(), fit=False) + assert_allclose(osm1, osm2) + assert_allclose(osr1, osr2) + + @dec.skipif(not have_matplotlib) + def 
test_plot_kwarg(self):
+        np.random.seed(7654321)
+        fig = plt.figure()
+        fig.add_subplot(111)
+        x = stats.t.rvs(3, size=100)
+        res1, fitres1 = stats.probplot(x, plot=plt)
+        plt.close()
+        res2, fitres2 = stats.probplot(x, plot=None)
+        res3 = stats.probplot(x, fit=False, plot=plt)
+        plt.close()
+        res4 = stats.probplot(x, fit=False, plot=None)
+        # Check that results are consistent between combinations of `fit` and
+        # `plot` keywords.
+        assert_(len(res1) == len(res2) == len(res3) == len(res4) == 2)
+        assert_allclose(res1, res2)
+        assert_allclose(res1, res3)
+        assert_allclose(res1, res4)
+        assert_allclose(fitres1, fitres2)
+
+        # Check that a Matplotlib Axes object is accepted
+        fig = plt.figure()
+        ax = fig.add_subplot(111)
+        stats.probplot(x, fit=False, plot=ax)
+        plt.close()
+
+    def test_probplot_bad_args(self):
+        # Raise ValueError when given an invalid distribution.
+        assert_raises(ValueError, stats.probplot, [1], dist="plate_of_shrimp")
+
+
+def test_wilcoxon_bad_arg():
+    # Raise ValueError when two args of different lengths are given or
+    # zero_method is unknown.
+    assert_raises(ValueError, stats.wilcoxon, [1], [1,2])
+    assert_raises(ValueError, stats.wilcoxon, [1,2], [1,2], "dummy")
+
+
+def test_mvsdist_bad_arg():
+    # Raise ValueError if fewer than two data points are given.
+    data = [1]
+    assert_raises(ValueError, stats.mvsdist, data)
+
+
+def test_kstat_bad_arg():
+    # Raise ValueError if n > 4 or n < 1.
+    data = [1]
+    n = 10
+    assert_raises(ValueError, stats.kstat, data, n=n)
+
+
+def test_kstatvar_bad_arg():
+    # Raise ValueError if n is not 1 or 2.
+    data = [1]
+    n = 10
+    assert_raises(ValueError, stats.kstatvar, data, n=n)
+
+
+def test_ppcc_max_bad_arg():
+    # Raise ValueError when given an invalid distribution.
+    data = [1]
+    assert_raises(ValueError, stats.ppcc_max, data, dist="plate_of_shrimp")
+
+
+class TestBoxcox_llf(TestCase):
+
+    def test_basic(self):
+        np.random.seed(54321)
+        x = stats.norm.rvs(size=10000, loc=10)
+        lmbda = 1
+        llf = stats.boxcox_llf(lmbda, x)
+        llf_expected = -x.size / 2. * np.log(np.sum(x.std()**2))
+        assert_allclose(llf, llf_expected)
+
+    def test_array_like(self):
+        np.random.seed(54321)
+        x = stats.norm.rvs(size=100, loc=10)
+        lmbda = 1
+        llf = stats.boxcox_llf(lmbda, x)
+        llf2 = stats.boxcox_llf(lmbda, list(x))
+        assert_allclose(llf, llf2, rtol=1e-12)
+
+    def test_2d_input(self):
+        # Note: boxcox_llf() was already working with 2-D input (sort of), so
+        # keep it like that. boxcox() doesn't work with 2-D input though, due
+        # to brent() returning a scalar.
+        np.random.seed(54321)
+        x = stats.norm.rvs(size=100, loc=10)
+        lmbda = 1
+        llf = stats.boxcox_llf(lmbda, x)
+        llf2 = stats.boxcox_llf(lmbda, np.vstack([x, x]).T)
+        assert_allclose([llf, llf], llf2, rtol=1e-12)
+
+    def test_empty(self):
+        assert_(np.isnan(stats.boxcox_llf(1, [])))
+
+
+class TestBoxcox(TestCase):
+
+    def test_fixed_lmbda(self):
+        np.random.seed(12345)
+        x = stats.loggamma.rvs(5, size=50) + 5
+        xt = stats.boxcox(x, lmbda=1)
+        assert_allclose(xt, x - 1)
+        xt = stats.boxcox(x, lmbda=-1)
+        assert_allclose(xt, 1 - 1/x)
+
+        xt = stats.boxcox(x, lmbda=0)
+        assert_allclose(xt, np.log(x))
+
+        # Also test that array_like input works
+        xt = stats.boxcox(list(x), lmbda=0)
+        assert_allclose(xt, np.log(x))
+
+    def test_lmbda_None(self):
+        np.random.seed(1234567)
+        # Start from normal rv's, do inverse transform to check that
+        # optimization function gets close to the right answer.
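+        # The data are constructed so that the fitted lmbda should come out
+        # close to -1/lmbda.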
+ np.random.seed(1245) + lmbda = 2.5 + x = stats.norm.rvs(loc=10, size=50000) + x_inv = (x * lmbda + 1)**(-lmbda) + xt, maxlog = stats.boxcox(x_inv) + + assert_almost_equal(maxlog, -1 / lmbda, decimal=2) + + def test_alpha(self): + np.random.seed(1234) + x = stats.loggamma.rvs(5, size=50) + 5 + + # Some regular values for alpha, on a small sample size + _, _, interval = stats.boxcox(x, alpha=0.75) + assert_allclose(interval, [4.004485780226041, 5.138756355035744]) + _, _, interval = stats.boxcox(x, alpha=0.05) + assert_allclose(interval, [1.2138178554857557, 8.209033272375663]) + + # Try some extreme values, see we don't hit the N=500 limit + x = stats.loggamma.rvs(7, size=500) + 15 + _, _, interval = stats.boxcox(x, alpha=0.001) + assert_allclose(interval, [0.3988867, 11.40553131]) + _, _, interval = stats.boxcox(x, alpha=0.999) + assert_allclose(interval, [5.83316246, 5.83735292]) + + def test_boxcox_bad_arg(self): + # Raise ValueError if any data value is negative. + x = np.array([-1]) + assert_raises(ValueError, stats.boxcox, x) + + def test_empty(self): + assert_(stats.boxcox([]).shape == (0,)) + + +class TestBoxcoxNormmax(TestCase): + def setUp(self): + np.random.seed(12345) + self.x = stats.loggamma.rvs(5, size=50) + 5 + + def test_pearsonr(self): + maxlog = stats.boxcox_normmax(self.x) + assert_allclose(maxlog, 1.804465325046) + + def test_mle(self): + maxlog = stats.boxcox_normmax(self.x, method='mle') + assert_allclose(maxlog, 1.758101454114) + + # Check that boxcox() uses 'mle' + _, maxlog_boxcox = stats.boxcox(self.x) + assert_allclose(maxlog_boxcox, maxlog) + + def test_all(self): + maxlog_all = stats.boxcox_normmax(self.x, method='all') + assert_allclose(maxlog_all, [1.804465325046, 1.758101454114]) + + +class TestBoxcoxNormplot(TestCase): + def setUp(self): + np.random.seed(7654321) + self.x = stats.loggamma.rvs(5, size=500) + 5 + + def test_basic(self): + N = 5 + lmbdas, ppcc = stats.boxcox_normplot(self.x, -10, 10, N=N) + ppcc_expected = [0.57783375, 0.83610988, 0.97524311, 0.99756057, + 0.95843297] + assert_allclose(lmbdas, np.linspace(-10, 10, num=N)) + assert_allclose(ppcc, ppcc_expected) + + @dec.skipif(not have_matplotlib) + def test_plot_kwarg(self): + # Check with the matplotlib.pyplot module + fig = plt.figure() + fig.add_subplot(111) + stats.boxcox_normplot(self.x, -20, 20, plot=plt) + plt.close() + + # Check that a Matplotlib Axes object is accepted + fig.add_subplot(111) + ax = fig.add_subplot(111) + stats.boxcox_normplot(self.x, -20, 20, plot=ax) + plt.close() + + def test_invalid_inputs(self): + # `lb` has to be larger than `la` + assert_raises(ValueError, stats.boxcox_normplot, self.x, 1, 0) + # `x` can not contain negative values + assert_raises(ValueError, stats.boxcox_normplot, [-1, 1] , 0, 1) + + def test_empty(self): + assert_(stats.boxcox_normplot([], 0, 1).size == 0) + + +class TestCircFuncs(TestCase): + def test_circfuncs(self): + x = np.array([355,5,2,359,10,350]) + M = stats.circmean(x, high=360) + Mval = 0.167690146 + assert_allclose(M, Mval, rtol=1e-7) + + V = stats.circvar(x, high=360) + Vval = 42.51955609 + assert_allclose(V, Vval, rtol=1e-7) + + S = stats.circstd(x, high=360) + Sval = 6.520702116 + assert_allclose(S, Sval, rtol=1e-7) + + def test_circfuncs_small(self): + x = np.array([20,21,22,18,19,20.5,19.2]) + M1 = x.mean() + M2 = stats.circmean(x, high=360) + assert_allclose(M2, M1, rtol=1e-5) + + V1 = x.var() + V2 = stats.circvar(x, high=360) + assert_allclose(V2, V1, rtol=1e-4) + + S1 = x.std() + S2 = stats.circstd(x, high=360) + 
assert_allclose(S2, S1, rtol=1e-4) + + def test_circmean_axis(self): + x = np.array([[355,5,2,359,10,350], + [351,7,4,352,9,349], + [357,9,8,358,4,356]]) + M1 = stats.circmean(x, high=360) + M2 = stats.circmean(x.ravel(), high=360) + assert_allclose(M1, M2, rtol=1e-14) + + M1 = stats.circmean(x, high=360, axis=1) + M2 = [stats.circmean(x[i], high=360) for i in range(x.shape[0])] + assert_allclose(M1, M2, rtol=1e-14) + + M1 = stats.circmean(x, high=360, axis=0) + M2 = [stats.circmean(x[:,i], high=360) for i in range(x.shape[1])] + assert_allclose(M1, M2, rtol=1e-14) + + def test_circvar_axis(self): + x = np.array([[355,5,2,359,10,350], + [351,7,4,352,9,349], + [357,9,8,358,4,356]]) + + V1 = stats.circvar(x, high=360) + V2 = stats.circvar(x.ravel(), high=360) + assert_allclose(V1, V2, rtol=1e-11) + + V1 = stats.circvar(x, high=360, axis=1) + V2 = [stats.circvar(x[i], high=360) for i in range(x.shape[0])] + assert_allclose(V1, V2, rtol=1e-11) + + V1 = stats.circvar(x, high=360, axis=0) + V2 = [stats.circvar(x[:,i], high=360) for i in range(x.shape[1])] + assert_allclose(V1, V2, rtol=1e-11) + + def test_circstd_axis(self): + x = np.array([[355,5,2,359,10,350], + [351,7,4,352,9,349], + [357,9,8,358,4,356]]) + + S1 = stats.circstd(x, high=360) + S2 = stats.circstd(x.ravel(), high=360) + assert_allclose(S1, S2, rtol=1e-11) + + S1 = stats.circstd(x, high=360, axis=1) + S2 = [stats.circstd(x[i], high=360) for i in range(x.shape[0])] + assert_allclose(S1, S2, rtol=1e-11) + + S1 = stats.circstd(x, high=360, axis=0) + S2 = [stats.circstd(x[:,i], high=360) for i in range(x.shape[1])] + assert_allclose(S1, S2, rtol=1e-11) + + def test_circfuncs_array_like(self): + x = [355,5,2,359,10,350] + assert_allclose(stats.circmean(x, high=360), 0.167690146, rtol=1e-7) + assert_allclose(stats.circvar(x, high=360), 42.51955609, rtol=1e-7) + assert_allclose(stats.circstd(x, high=360), 6.520702116, rtol=1e-7) + + def test_empty(self): + assert_(np.isnan(stats.circmean([]))) + assert_(np.isnan(stats.circstd([]))) + assert_(np.isnan(stats.circvar([]))) + + +def test_accuracy_wilcoxon(): + freq = [1, 4, 16, 15, 8, 4, 5, 1, 2] + nums = range(-4, 5) + x = np.concatenate([[u] * v for u, v in zip(nums, freq)]) + y = np.zeros(x.size) + + T, p = stats.wilcoxon(x, y, "pratt") + assert_allclose(T, 423) + assert_allclose(p, 0.00197547303533107) + + T, p = stats.wilcoxon(x, y, "zsplit") + assert_allclose(T, 441) + assert_allclose(p, 0.0032145343172473055) + + T, p = stats.wilcoxon(x, y, "wilcox") + assert_allclose(T, 327) + assert_allclose(p, 0.00641346115861) + + # Test the 'correction' option, using values computed in R with: + # > wilcox.test(x, y, paired=TRUE, exact=FALSE, correct={FALSE,TRUE}) + x = np.array([120, 114, 181, 188, 180, 146, 121, 191, 132, 113, 127, 112]) + y = np.array([133, 143, 119, 189, 112, 199, 198, 113, 115, 121, 142, 187]) + T, p = stats.wilcoxon(x, y, correction=False) + assert_equal(T, 34) + assert_allclose(p, 0.6948866, rtol=1e-6) + T, p = stats.wilcoxon(x, y, correction=True) + assert_equal(T, 34) + assert_allclose(p, 0.7240817, rtol=1e-6) + + +def test_wilcoxon_tie(): + # Regression test for gh-2391. 
+ # Corresponding R code is: + # > result = wilcox.test(rep(0.1, 10), exact=FALSE, correct=FALSE) + # > result$p.value + # [1] 0.001565402 + # > result = wilcox.test(rep(0.1, 10), exact=FALSE, correct=TRUE) + # > result$p.value + # [1] 0.001904195 + stat, p = stats.wilcoxon([0.1] * 10) + expected_p = 0.001565402 + assert_equal(stat, 0) + assert_allclose(p, expected_p, rtol=1e-6) + + stat, p = stats.wilcoxon([0.1] * 10, correction=True) + expected_p = 0.001904195 + assert_equal(stat, 0) + assert_allclose(p, expected_p, rtol=1e-6) + + +if __name__ == "__main__": + run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_mstats_basic.py b/pywafo/src/wafo/stats/tests/test_mstats_basic.py index ca45e8b..39057f8 100644 --- a/pywafo/src/wafo/stats/tests/test_mstats_basic.py +++ b/pywafo/src/wafo/stats/tests/test_mstats_basic.py @@ -1,490 +1,657 @@ -""" -Tests for the stats.mstats module (support for maskd arrays) -""" - - -import numpy as np -from numpy import nan -import numpy.ma as ma -from numpy.ma import masked, nomask - -import scipy.stats.mstats as mstats -from numpy.testing import TestCase, run_module_suite -from numpy.ma.testutils import assert_equal, assert_almost_equal, \ - assert_array_almost_equal, assert_ - - -class TestMquantiles(TestCase): - """Regression tests for mstats module.""" - def test_mquantiles_limit_keyword(self): - """Ticket #867""" - data = np.array([[ 6., 7., 1.], - [ 47., 15., 2.], - [ 49., 36., 3.], - [ 15., 39., 4.], - [ 42., 40., -999.], - [ 41., 41., -999.], - [ 7., -999., -999.], - [ 39., -999., -999.], - [ 43., -999., -999.], - [ 40., -999., -999.], - [ 36., -999., -999.]]) - desired = [[19.2, 14.6, 1.45], - [40.0, 37.5, 2.5 ], - [42.8, 40.05, 3.55]] - quants = mstats.mquantiles(data, axis=0, limit=(0, 50)) - assert_almost_equal(quants, desired) - - - -class TestGMean(TestCase): - def test_1D(self): - a = (1,2,3,4) - actual= mstats.gmean(a) - desired = np.power(1*2*3*4,1./4.) - assert_almost_equal(actual, desired,decimal=14) - - desired1 = mstats.gmean(a,axis=-1) - assert_almost_equal(actual, desired1, decimal=14) - assert_(not isinstance(desired1, ma.MaskedArray)) - # - a = ma.array((1,2,3,4),mask=(0,0,0,1)) - actual= mstats.gmean(a) - desired = np.power(1*2*3,1./3.) - assert_almost_equal(actual, desired,decimal=14) - - desired1 = mstats.gmean(a,axis=-1) - assert_almost_equal(actual, desired1, decimal=14) - # - def test_2D(self): - a = ma.array(((1,2,3,4),(1,2,3,4),(1,2,3,4)), - mask=((0,0,0,0),(1,0,0,1),(0,1,1,0))) - actual= mstats.gmean(a) - desired = np.array((1,2,3,4)) - assert_array_almost_equal(actual, desired, decimal=14) - # - desired1 = mstats.gmean(a,axis=0) - assert_array_almost_equal(actual, desired1, decimal=14) - # - actual= mstats.gmean(a, -1) - desired = ma.array((np.power(1*2*3*4,1./4.), - np.power(2*3,1./2.), - np.power(1*4,1./2.))) - assert_array_almost_equal(actual, desired, decimal=14) - -class TestHMean(TestCase): - def test_1D(self): - a = (1,2,3,4) - actual= mstats.hmean(a) - desired = 4. / (1./1 + 1./2 + 1./3 + 1./4) - assert_almost_equal(actual, desired, decimal=14) - desired1 = mstats.hmean(ma.array(a),axis=-1) - assert_almost_equal(actual, desired1, decimal=14) - # - a = ma.array((1,2,3,4),mask=(0,0,0,1)) - actual= mstats.hmean(a) - desired = 3. 
/ (1./1 + 1./2 + 1./3) - assert_almost_equal(actual, desired,decimal=14) - desired1 = mstats.hmean(a,axis=-1) - assert_almost_equal(actual, desired1, decimal=14) - - def test_2D(self): - a = ma.array(((1,2,3,4),(1,2,3,4),(1,2,3,4)), - mask=((0,0,0,0),(1,0,0,1),(0,1,1,0))) - actual= mstats.hmean(a) - desired = ma.array((1,2,3,4)) - assert_array_almost_equal(actual, desired, decimal=14) - # - actual1 = mstats.hmean(a,axis=-1) - desired = (4./(1/1.+1/2.+1/3.+1/4.), - 2./(1/2.+1/3.), - 2./(1/1.+1/4.) - ) - assert_array_almost_equal(actual1, desired, decimal=14) - - -class TestRanking(TestCase): - # - def __init__(self, *args, **kwargs): - TestCase.__init__(self, *args, **kwargs) - # - def test_ranking(self): - x = ma.array([0,1,1,1,2,3,4,5,5,6,]) - assert_almost_equal(mstats.rankdata(x),[1,3,3,3,5,6,7,8.5,8.5,10]) - x[[3,4]] = masked - assert_almost_equal(mstats.rankdata(x),[1,2.5,2.5,0,0,4,5,6.5,6.5,8]) - assert_almost_equal(mstats.rankdata(x,use_missing=True), - [1,2.5,2.5,4.5,4.5,4,5,6.5,6.5,8]) - x = ma.array([0,1,5,1,2,4,3,5,1,6,]) - assert_almost_equal(mstats.rankdata(x),[1,3,8.5,3,5,7,6,8.5,3,10]) - x = ma.array([[0,1,1,1,2], [3,4,5,5,6,]]) - assert_almost_equal(mstats.rankdata(x),[[1,3,3,3,5],[6,7,8.5,8.5,10]]) - assert_almost_equal(mstats.rankdata(x,axis=1),[[1,3,3,3,5],[1,2,3.5,3.5,5]]) - assert_almost_equal(mstats.rankdata(x,axis=0),[[1,1,1,1,1],[2,2,2,2,2,]]) - - -class TestCorr(TestCase): - # - def test_pearsonr(self): - "Tests some computations of Pearson's r" - x = ma.arange(10) - olderr = np.seterr(all='ignore') - try: - assert_almost_equal(mstats.pearsonr(x,x)[0], 1.0) - assert_almost_equal(mstats.pearsonr(x,x[::-1])[0], -1.0) - - x = ma.array(x, mask=True) - pr = mstats.pearsonr(x,x) - finally: - np.seterr(**olderr) - assert_(pr[0] is masked) - assert_(pr[1] is masked) - # - def test_spearmanr(self): - "Tests some computations of Spearman's rho" - (x, y) = ([5.05,6.75,3.21,2.66],[1.65,2.64,2.64,6.95]) - assert_almost_equal(mstats.spearmanr(x,y)[0], -0.6324555) - (x, y) = ([5.05,6.75,3.21,2.66,np.nan],[1.65,2.64,2.64,6.95,np.nan]) - (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y)) - assert_almost_equal(mstats.spearmanr(x,y)[0], -0.6324555) - # - x = [ 2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, - 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7] - y = [22.6, 08.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, - 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4] - assert_almost_equal(mstats.spearmanr(x,y)[0], 0.6887299) - x = [ 2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, - 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7, np.nan] - y = [22.6, 08.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, - 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4, np.nan] - (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y)) - assert_almost_equal(mstats.spearmanr(x,y)[0], 0.6887299) - # - def test_kendalltau(self): - "Tests some computations of Kendall's tau" - x = ma.fix_invalid([5.05, 6.75, 3.21, 2.66,np.nan]) - y = ma.fix_invalid([1.65, 26.5, -5.93, 7.96, np.nan]) - z = ma.fix_invalid([1.65, 2.64, 2.64, 6.95, np.nan]) - assert_almost_equal(np.asarray(mstats.kendalltau(x,y)), - [+0.3333333,0.4969059]) - assert_almost_equal(np.asarray(mstats.kendalltau(x,z)), - [-0.5477226,0.2785987]) - # - x = ma.fix_invalid([ 0, 0, 0, 0,20,20, 0,60, 0,20, - 10,10, 0,40, 0,20, 0, 0, 0, 0, 0, np.nan]) - y = ma.fix_invalid([ 0,80,80,80,10,33,60, 0,67,27, - 25,80,80,80,80,80,80, 0,10,45, np.nan, 0]) - result = mstats.kendalltau(x,y) - assert_almost_equal(np.asarray(result), [-0.1585188, 0.4128009]) - # - def test_kendalltau_seasonal(self): - "Tests the seasonal Kendall tau." 
- x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1], - [ 4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3], - [ 3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan], - [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]] - x = ma.fix_invalid(x).T - output = mstats.kendalltau_seasonal(x) - assert_almost_equal(output['global p-value (indep)'], 0.008, 3) - assert_almost_equal(output['seasonal p-value'].round(2), - [0.18,0.53,0.20,0.04]) - # - def test_pointbiserial(self): - "Tests point biserial" - x = [1,0,1,1,1,1,0,1,0,0,0,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0, - 0,0,0,0,1,-1] - y = [14.8,13.8,12.4,10.1,7.1,6.1,5.8,4.6,4.3,3.5,3.3,3.2,3.0, - 2.8,2.8,2.5,2.4,2.3,2.1,1.7,1.7,1.5,1.3,1.3,1.2,1.2,1.1, - 0.8,0.7,0.6,0.5,0.2,0.2,0.1,np.nan] - assert_almost_equal(mstats.pointbiserialr(x, y)[0], 0.36149, 5) - - -class TestTrimming(TestCase): - # - def test_trim(self): - "Tests trimming" - a = ma.arange(10) - assert_equal(mstats.trim(a), [0,1,2,3,4,5,6,7,8,9]) - a = ma.arange(10) - assert_equal(mstats.trim(a,(2,8)), [None,None,2,3,4,5,6,7,8,None]) - a = ma.arange(10) - assert_equal(mstats.trim(a,limits=(2,8),inclusive=(False,False)), - [None,None,None,3,4,5,6,7,None,None]) - a = ma.arange(10) - assert_equal(mstats.trim(a,limits=(0.1,0.2),relative=True), - [None,1,2,3,4,5,6,7,None,None]) - # - a = ma.arange(12) - a[[0,-1]] = a[5] = masked - assert_equal(mstats.trim(a,(2,8)), - [None,None,2,3,4,None,6,7,8,None,None,None]) - # - x = ma.arange(100).reshape(10,10) - trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=None) - assert_equal(trimx._mask.ravel(),[1]*10+[0]*70+[1]*20) - trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=0) - assert_equal(trimx._mask.ravel(),[1]*10+[0]*70+[1]*20) - trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=-1) - assert_equal(trimx._mask.T.ravel(),[1]*10+[0]*70+[1]*20) - # - x = ma.arange(110).reshape(11,10) - x[1] = masked - trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=None) - assert_equal(trimx._mask.ravel(),[1]*20+[0]*70+[1]*20) - trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=0) - assert_equal(trimx._mask.ravel(),[1]*20+[0]*70+[1]*20) - trimx = mstats.trim(x.T,(0.1,0.2),relative=True,axis=-1) - assert_equal(trimx.T._mask.ravel(),[1]*20+[0]*70+[1]*20) - # - def test_trim_old(self): - "Tests trimming." - x = ma.arange(100) - assert_equal(mstats.trimboth(x).count(), 60) - assert_equal(mstats.trimtail(x,tail='r').count(), 80) - x[50:70] = masked - trimx = mstats.trimboth(x) - assert_equal(trimx.count(), 48) - assert_equal(trimx._mask, [1]*16 + [0]*34 + [1]*20 + [0]*14 + [1]*16) - x._mask = nomask - x.shape = (10,10) - assert_equal(mstats.trimboth(x).count(), 60) - assert_equal(mstats.trimtail(x).count(), 80) - # - def test_trimmedmean(self): - "Tests the trimmed mean." - data = ma.array([ 77, 87, 88,114,151,210,219,246,253,262, - 296,299,306,376,428,515,666,1310,2611]) - assert_almost_equal(mstats.trimmed_mean(data,0.1), 343, 0) - assert_almost_equal(mstats.trimmed_mean(data,(0.1,0.1)), 343, 0) - assert_almost_equal(mstats.trimmed_mean(data,(0.2,0.2)), 283, 0) - # - def test_trimmed_stde(self): - "Tests the trimmed mean standard error." - data = ma.array([ 77, 87, 88,114,151,210,219,246,253,262, - 296,299,306,376,428,515,666,1310,2611]) - assert_almost_equal(mstats.trimmed_stde(data,(0.2,0.2)), 56.13193, 5) - assert_almost_equal(mstats.trimmed_stde(data,0.2), 56.13193, 5) - # - def test_winsorization(self): - "Tests the Winsorization of the data." 
- data = ma.array([ 77, 87, 88,114,151,210,219,246,253,262, - 296,299,306,376,428,515,666,1310,2611]) - assert_almost_equal(mstats.winsorize(data,(0.2,0.2)).var(ddof=1), - 21551.4, 1) - data[5] = masked - winsorized = mstats.winsorize(data) - assert_equal(winsorized.mask, data.mask) - - -class TestMoments(TestCase): - """ - Comparison numbers are found using R v.1.5.1 - note that length(testcase) = 4 - testmathworks comes from documentation for the - Statistics Toolbox for Matlab and can be found at both - http://www.mathworks.com/access/helpdesk/help/toolbox/stats/kurtosis.shtml - http://www.mathworks.com/access/helpdesk/help/toolbox/stats/skewness.shtml - Note that both test cases came from here. - """ - testcase = [1,2,3,4] - testmathworks = ma.fix_invalid([1.165 , 0.6268, 0.0751, 0.3516, -0.6965, - np.nan]) - def test_moment(self): - """ - mean((testcase-mean(testcase))**power,axis=0),axis=0))**power))""" - y = mstats.moment(self.testcase,1) - assert_almost_equal(y,0.0,10) - y = mstats.moment(self.testcase,2) - assert_almost_equal(y,1.25) - y = mstats.moment(self.testcase,3) - assert_almost_equal(y,0.0) - y = mstats.moment(self.testcase,4) - assert_almost_equal(y,2.5625) - def test_variation(self): - """variation = samplestd/mean """ -## y = stats.variation(self.shoes[0]) -## assert_almost_equal(y,21.8770668) - y = mstats.variation(self.testcase) - assert_almost_equal(y,0.44721359549996, 10) - - def test_skewness(self): - """ - sum((testmathworks-mean(testmathworks,axis=0))**3,axis=0)/((sqrt(var(testmathworks)*4/5))**3)/5 - """ - y = mstats.skew(self.testmathworks) - assert_almost_equal(y,-0.29322304336607,10) - y = mstats.skew(self.testmathworks,bias=0) - assert_almost_equal(y,-0.437111105023940,10) - y = mstats.skew(self.testcase) - assert_almost_equal(y,0.0,10) - - def test_kurtosis(self): - """ - sum((testcase-mean(testcase,axis=0))**4,axis=0)/((sqrt(var(testcase)*3/4))**4)/4 - sum((test2-mean(testmathworks,axis=0))**4,axis=0)/((sqrt(var(testmathworks)*4/5))**4)/5 - Set flags for axis = 0 and - fisher=0 (Pearson's definition of kurtosis for compatibility with Matlab) - """ - y = mstats.kurtosis(self.testmathworks,0,fisher=0,bias=1) - assert_almost_equal(y, 2.1658856802973,10) - # Note that MATLAB has confusing docs for the following case - # kurtosis(x,0) gives an unbiased estimate of Pearson's skewness - # kurtosis(x) gives a biased estimate of Fisher's skewness (Pearson-3) - # The MATLAB docs imply that both should give Fisher's - y = mstats.kurtosis(self.testmathworks,fisher=0,bias=0) - assert_almost_equal(y, 3.663542721189047,10) - y = mstats.kurtosis(self.testcase,0,0) - assert_almost_equal(y,1.64) - # - def test_mode(self): - "Tests the mode" - # - a1 = [0,0,0,1,1,1,2,3,3,3,3,4,5,6,7] - a2 = np.reshape(a1, (3,5)) - ma1 = ma.masked_where(ma.array(a1)>2,a1) - ma2 = ma.masked_where(a2>2, a2) - assert_equal(mstats.mode(a1, axis=None), (3,4)) - assert_equal(mstats.mode(ma1, axis=None), (0,3)) - assert_equal(mstats.mode(a2, axis=None), (3,4)) - assert_equal(mstats.mode(ma2, axis=None), (0,3)) - assert_equal(mstats.mode(a2, axis=0), ([[0,0,0,1,1]],[[1,1,1,1,1]])) - assert_equal(mstats.mode(ma2, axis=0), ([[0,0,0,1,1]],[[1,1,1,1,1]])) - assert_equal(mstats.mode(a2, axis=-1), ([[0],[3],[3]], [[3],[3],[1]])) - assert_equal(mstats.mode(ma2, axis=-1), ([[0],[1],[0]], [[3],[1],[0]])) - - -class TestPercentile(TestCase): - def setUp(self): - self.a1 = [3,4,5,10,-3,-5,6] - self.a2 = [3,-6,-2,8,7,4,2,1] - self.a3 = [3.,4,5,10,-3,-5,-6,7.0] - - def test_percentile(self): - x = np.arange(8) * 
0.5 - assert_equal(mstats.scoreatpercentile(x, 0), 0.) - assert_equal(mstats.scoreatpercentile(x, 100), 3.5) - assert_equal(mstats.scoreatpercentile(x, 50), 1.75) - - def test_2D(self): - x = ma.array([[1, 1, 1], - [1, 1, 1], - [4, 4, 3], - [1, 1, 1], - [1, 1, 1]]) - assert_equal(mstats.scoreatpercentile(x,50), [1,1,1]) - - -class TestVariability(TestCase): - """ Comparison numbers are found using R v.1.5.1 - note that length(testcase) = 4 - """ - testcase = ma.fix_invalid([1,2,3,4,np.nan]) - - def test_signaltonoise(self): - """ - this is not in R, so used - mean(testcase,axis=0)/(sqrt(var(testcase)*3/4)) """ - #y = stats.signaltonoise(self.shoes[0]) - #assert_approx_equal(y,4.5709967) - y = mstats.signaltonoise(self.testcase) - assert_almost_equal(y,2.236067977) - - def test_sem(self): - """ - this is not in R, so used - sqrt(var(testcase)*3/4)/sqrt(3) - """ - #y = stats.sem(self.shoes[0]) - #assert_approx_equal(y,0.775177399) - y = mstats.sem(self.testcase) - assert_almost_equal(y,0.6454972244) - - def test_zmap(self): - """ - not in R, so tested by using - (testcase[i]-mean(testcase,axis=0))/sqrt(var(testcase)*3/4) - """ - y = mstats.zmap(self.testcase, self.testcase) - desired_unmaskedvals = ([-1.3416407864999, -0.44721359549996 , - 0.44721359549996 , 1.3416407864999]) - assert_array_almost_equal(desired_unmaskedvals, - y.data[y.mask==False], decimal=12) - - def test_zscore(self): - """ - not in R, so tested by using - (testcase[i]-mean(testcase,axis=0))/sqrt(var(testcase)*3/4) - """ - y = mstats.zscore(self.testcase) - desired = ma.fix_invalid([-1.3416407864999, -0.44721359549996 , - 0.44721359549996 , 1.3416407864999, np.nan]) - assert_almost_equal(desired, y, decimal=12) - - - -class TestMisc(TestCase): - # - def test_obrientransform(self): - "Tests Obrien transform" - args = [[5]*5+[6]*11+[7]*9+[8]*3+[9]*2+[10]*2, - [6]+[7]*2+[8]*4+[9]*9+[10]*16] - result = [5*[3.1828]+11*[0.5591]+9*[0.0344]+3*[1.6086]+2*[5.2817]+2*[11.0538], - [10.4352]+2*[4.8599]+4*[1.3836]+9*[0.0061]+16*[0.7277]] - assert_almost_equal(np.round(mstats.obrientransform(*args).T,4), - result,4) - # - def test_kstwosamp(self): - "Tests the Kolmogorov-Smirnov 2 samples test" - x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1], - [ 4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3], - [ 3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan], - [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]] - x = ma.fix_invalid(x).T - (winter,spring,summer,fall) = x.T - # - assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring),4), - (0.1818,0.9892)) - assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring,'g'),4), - (0.1469,0.7734)) - assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring,'l'),4), - (0.1818,0.6744)) - # - def test_friedmanchisq(self): - "Tests the Friedman Chi-square test" - # No missing values - args = ([9.0,9.5,5.0,7.5,9.5,7.5,8.0,7.0,8.5,6.0], - [7.0,6.5,7.0,7.5,5.0,8.0,6.0,6.5,7.0,7.0], - [6.0,8.0,4.0,6.0,7.0,6.5,6.0,4.0,6.5,3.0]) - result = mstats.friedmanchisquare(*args) - assert_almost_equal(result[0], 10.4737, 4) - assert_almost_equal(result[1], 0.005317, 6) - # Missing values - x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1], - [ 4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3], - [ 3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan], - [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]] - x = ma.fix_invalid(x) - result = mstats.friedmanchisquare(*x) - assert_almost_equal(result[0], 2.0156, 4) - assert_almost_equal(result[1], 0.5692, 4) - - -def test_regress_simple(): - """Regress a line with sinusoidal noise. 
Test for #1273.""" - x = np.linspace(0, 100, 100) - y = 0.2 * np.linspace(0, 100, 100) + 10 - y += np.sin(np.linspace(0, 20, 100)) - - slope, intercept, r_value, p_value, sterr = mstats.linregress(x, y) - assert_almost_equal(slope, 0.19644990055858422) - assert_almost_equal(intercept, 10.211269918932341) - - -def test_plotting_positions(): - """Regression test for #1256""" - pos = mstats.plotting_positions(np.arange(3), 0, 0) - assert_array_almost_equal(pos.data, np.array([0.25, 0.5, 0.75])) - - -if __name__ == "__main__": - run_module_suite() +""" +Tests for the stats.mstats module (support for masked arrays) +""" +from __future__ import division, print_function, absolute_import + +import warnings + +import numpy as np +from numpy import nan +import numpy.ma as ma +from numpy.ma import masked, nomask + +import wafo.stats.mstats as mstats +from wafo import stats +from numpy.testing import TestCase, run_module_suite +from numpy.ma.testutils import (assert_equal, assert_almost_equal, + assert_array_almost_equal, assert_array_almost_equal_nulp, assert_, + assert_allclose, assert_raises) + + +class TestMquantiles(TestCase): + def test_mquantiles_limit_keyword(self): + # Regression test for Trac ticket #867 + data = np.array([[6., 7., 1.], + [47., 15., 2.], + [49., 36., 3.], + [15., 39., 4.], + [42., 40., -999.], + [41., 41., -999.], + [7., -999., -999.], + [39., -999., -999.], + [43., -999., -999.], + [40., -999., -999.], + [36., -999., -999.]]) + desired = [[19.2, 14.6, 1.45], + [40.0, 37.5, 2.5], + [42.8, 40.05, 3.55]] + quants = mstats.mquantiles(data, axis=0, limit=(0, 50)) + assert_almost_equal(quants, desired) + + +class TestGMean(TestCase): + def test_1D(self): + a = (1,2,3,4) + actual = mstats.gmean(a) + desired = np.power(1*2*3*4,1./4.) + assert_almost_equal(actual, desired,decimal=14) + + desired1 = mstats.gmean(a,axis=-1) + assert_almost_equal(actual, desired1, decimal=14) + assert_(not isinstance(desired1, ma.MaskedArray)) + + a = ma.array((1,2,3,4),mask=(0,0,0,1)) + actual = mstats.gmean(a) + desired = np.power(1*2*3,1./3.) + assert_almost_equal(actual, desired,decimal=14) + + desired1 = mstats.gmean(a,axis=-1) + assert_almost_equal(actual, desired1, decimal=14) + + def test_2D(self): + a = ma.array(((1,2,3,4),(1,2,3,4),(1,2,3,4)), + mask=((0,0,0,0),(1,0,0,1),(0,1,1,0))) + actual = mstats.gmean(a) + desired = np.array((1,2,3,4)) + assert_array_almost_equal(actual, desired, decimal=14) + + desired1 = mstats.gmean(a,axis=0) + assert_array_almost_equal(actual, desired1, decimal=14) + + actual = mstats.gmean(a, -1) + desired = ma.array((np.power(1*2*3*4,1./4.), + np.power(2*3,1./2.), + np.power(1*4,1./2.))) + assert_array_almost_equal(actual, desired, decimal=14) + + +class TestHMean(TestCase): + def test_1D(self): + a = (1,2,3,4) + actual = mstats.hmean(a) + desired = 4. / (1./1 + 1./2 + 1./3 + 1./4) + assert_almost_equal(actual, desired, decimal=14) + desired1 = mstats.hmean(ma.array(a),axis=-1) + assert_almost_equal(actual, desired1, decimal=14) + + a = ma.array((1,2,3,4),mask=(0,0,0,1)) + actual = mstats.hmean(a) + desired = 3.
/ (1./1 + 1./2 + 1./3) + assert_almost_equal(actual, desired,decimal=14) + desired1 = mstats.hmean(a,axis=-1) + assert_almost_equal(actual, desired1, decimal=14) + + def test_2D(self): + a = ma.array(((1,2,3,4),(1,2,3,4),(1,2,3,4)), + mask=((0,0,0,0),(1,0,0,1),(0,1,1,0))) + actual = mstats.hmean(a) + desired = ma.array((1,2,3,4)) + assert_array_almost_equal(actual, desired, decimal=14) + + actual1 = mstats.hmean(a,axis=-1) + desired = (4./(1/1.+1/2.+1/3.+1/4.), + 2./(1/2.+1/3.), + 2./(1/1.+1/4.) + ) + assert_array_almost_equal(actual1, desired, decimal=14) + + +class TestRanking(TestCase): + + def __init__(self, *args, **kwargs): + TestCase.__init__(self, *args, **kwargs) + + def test_ranking(self): + x = ma.array([0,1,1,1,2,3,4,5,5,6,]) + assert_almost_equal(mstats.rankdata(x),[1,3,3,3,5,6,7,8.5,8.5,10]) + x[[3,4]] = masked + assert_almost_equal(mstats.rankdata(x),[1,2.5,2.5,0,0,4,5,6.5,6.5,8]) + assert_almost_equal(mstats.rankdata(x,use_missing=True), + [1,2.5,2.5,4.5,4.5,4,5,6.5,6.5,8]) + x = ma.array([0,1,5,1,2,4,3,5,1,6,]) + assert_almost_equal(mstats.rankdata(x),[1,3,8.5,3,5,7,6,8.5,3,10]) + x = ma.array([[0,1,1,1,2], [3,4,5,5,6,]]) + assert_almost_equal(mstats.rankdata(x),[[1,3,3,3,5],[6,7,8.5,8.5,10]]) + assert_almost_equal(mstats.rankdata(x,axis=1),[[1,3,3,3,5],[1,2,3.5,3.5,5]]) + assert_almost_equal(mstats.rankdata(x,axis=0),[[1,1,1,1,1],[2,2,2,2,2,]]) + + +class TestCorr(TestCase): + + def test_pearsonr(self): + # Tests some computations of Pearson's r + x = ma.arange(10) + with warnings.catch_warnings(): + # The tests in this context are edge cases, with perfect + # correlation or anticorrelation, or totally masked data. + # None of these should trigger a RuntimeWarning. + warnings.simplefilter("error", RuntimeWarning) + + assert_almost_equal(mstats.pearsonr(x, x)[0], 1.0) + assert_almost_equal(mstats.pearsonr(x, x[::-1])[0], -1.0) + + x = ma.array(x, mask=True) + pr = mstats.pearsonr(x, x) + assert_(pr[0] is masked) + assert_(pr[1] is masked) + + x1 = ma.array([-1.0, 0.0, 1.0]) + y1 = ma.array([0, 0, 3]) + r, p = mstats.pearsonr(x1, y1) + assert_almost_equal(r, np.sqrt(3)/2) + assert_almost_equal(p, 1.0/3) + + # (x2, y2) have the same unmasked data as (x1, y1). 
+ mask = [False, False, False, True] + x2 = ma.array([-1.0, 0.0, 1.0, 99.0], mask=mask) + y2 = ma.array([0, 0, 3, -1], mask=mask) + r, p = mstats.pearsonr(x2, y2) + assert_almost_equal(r, np.sqrt(3)/2) + assert_almost_equal(p, 1.0/3) + + def test_spearmanr(self): + # Tests some computations of Spearman's rho + (x, y) = ([5.05,6.75,3.21,2.66],[1.65,2.64,2.64,6.95]) + assert_almost_equal(mstats.spearmanr(x,y)[0], -0.6324555) + (x, y) = ([5.05,6.75,3.21,2.66,np.nan],[1.65,2.64,2.64,6.95,np.nan]) + (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y)) + assert_almost_equal(mstats.spearmanr(x,y)[0], -0.6324555) + + x = [2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, + 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7] + y = [22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, + 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4] + assert_almost_equal(mstats.spearmanr(x,y)[0], 0.6887299) + x = [2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, + 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7, np.nan] + y = [22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, + 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4, np.nan] + (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y)) + assert_almost_equal(mstats.spearmanr(x,y)[0], 0.6887299) + + def test_kendalltau(self): + # Tests some computations of Kendall's tau + x = ma.fix_invalid([5.05, 6.75, 3.21, 2.66,np.nan]) + y = ma.fix_invalid([1.65, 26.5, -5.93, 7.96, np.nan]) + z = ma.fix_invalid([1.65, 2.64, 2.64, 6.95, np.nan]) + assert_almost_equal(np.asarray(mstats.kendalltau(x,y)), + [+0.3333333,0.4969059]) + assert_almost_equal(np.asarray(mstats.kendalltau(x,z)), + [-0.5477226,0.2785987]) + # + x = ma.fix_invalid([0, 0, 0, 0,20,20, 0,60, 0,20, + 10,10, 0,40, 0,20, 0, 0, 0, 0, 0, np.nan]) + y = ma.fix_invalid([0,80,80,80,10,33,60, 0,67,27, + 25,80,80,80,80,80,80, 0,10,45, np.nan, 0]) + result = mstats.kendalltau(x,y) + assert_almost_equal(np.asarray(result), [-0.1585188, 0.4128009]) + + def test_kendalltau_seasonal(self): + # Tests the seasonal Kendall tau. 
+ x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1], + [4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3], + [3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan], + [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]] + x = ma.fix_invalid(x).T + output = mstats.kendalltau_seasonal(x) + assert_almost_equal(output['global p-value (indep)'], 0.008, 3) + assert_almost_equal(output['seasonal p-value'].round(2), + [0.18,0.53,0.20,0.04]) + + def test_pointbiserial(self): + x = [1,0,1,1,1,1,0,1,0,0,0,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0, + 0,0,0,0,1,-1] + y = [14.8,13.8,12.4,10.1,7.1,6.1,5.8,4.6,4.3,3.5,3.3,3.2,3.0, + 2.8,2.8,2.5,2.4,2.3,2.1,1.7,1.7,1.5,1.3,1.3,1.2,1.2,1.1, + 0.8,0.7,0.6,0.5,0.2,0.2,0.1,np.nan] + assert_almost_equal(mstats.pointbiserialr(x, y)[0], 0.36149, 5) + + +class TestTrimming(TestCase): + + def test_trim(self): + a = ma.arange(10) + assert_equal(mstats.trim(a), [0,1,2,3,4,5,6,7,8,9]) + a = ma.arange(10) + assert_equal(mstats.trim(a,(2,8)), [None,None,2,3,4,5,6,7,8,None]) + a = ma.arange(10) + assert_equal(mstats.trim(a,limits=(2,8),inclusive=(False,False)), + [None,None,None,3,4,5,6,7,None,None]) + a = ma.arange(10) + assert_equal(mstats.trim(a,limits=(0.1,0.2),relative=True), + [None,1,2,3,4,5,6,7,None,None]) + + a = ma.arange(12) + a[[0,-1]] = a[5] = masked + assert_equal(mstats.trim(a,(2,8)), + [None,None,2,3,4,None,6,7,8,None,None,None]) + + x = ma.arange(100).reshape(10,10) + trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=None) + assert_equal(trimx._mask.ravel(),[1]*10+[0]*70+[1]*20) + trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=0) + assert_equal(trimx._mask.ravel(),[1]*10+[0]*70+[1]*20) + trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=-1) + assert_equal(trimx._mask.T.ravel(),[1]*10+[0]*70+[1]*20) + + x = ma.arange(110).reshape(11,10) + x[1] = masked + trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=None) + assert_equal(trimx._mask.ravel(),[1]*20+[0]*70+[1]*20) + trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=0) + assert_equal(trimx._mask.ravel(),[1]*20+[0]*70+[1]*20) + trimx = mstats.trim(x.T,(0.1,0.2),relative=True,axis=-1) + assert_equal(trimx.T._mask.ravel(),[1]*20+[0]*70+[1]*20) + + def test_trim_old(self): + x = ma.arange(100) + assert_equal(mstats.trimboth(x).count(), 60) + assert_equal(mstats.trimtail(x,tail='r').count(), 80) + x[50:70] = masked + trimx = mstats.trimboth(x) + assert_equal(trimx.count(), 48) + assert_equal(trimx._mask, [1]*16 + [0]*34 + [1]*20 + [0]*14 + [1]*16) + x._mask = nomask + x.shape = (10,10) + assert_equal(mstats.trimboth(x).count(), 60) + assert_equal(mstats.trimtail(x).count(), 80) + + def test_trimmedmean(self): + data = ma.array([77, 87, 88,114,151,210,219,246,253,262, + 296,299,306,376,428,515,666,1310,2611]) + assert_almost_equal(mstats.trimmed_mean(data,0.1), 343, 0) + assert_almost_equal(mstats.trimmed_mean(data,(0.1,0.1)), 343, 0) + assert_almost_equal(mstats.trimmed_mean(data,(0.2,0.2)), 283, 0) + + def test_trimmed_stde(self): + data = ma.array([77, 87, 88,114,151,210,219,246,253,262, + 296,299,306,376,428,515,666,1310,2611]) + assert_almost_equal(mstats.trimmed_stde(data,(0.2,0.2)), 56.13193, 5) + assert_almost_equal(mstats.trimmed_stde(data,0.2), 56.13193, 5) + + def test_winsorization(self): + data = ma.array([77, 87, 88,114,151,210,219,246,253,262, + 296,299,306,376,428,515,666,1310,2611]) + assert_almost_equal(mstats.winsorize(data,(0.2,0.2)).var(ddof=1), + 21551.4, 1) + data[5] = masked + winsorized = mstats.winsorize(data) + assert_equal(winsorized.mask, data.mask) + + +class TestMoments(TestCase): + # Comparison numbers are 
found using R v.1.5.1 + # note that length(testcase) = 4 + # testmathworks comes from documentation for the + # Statistics Toolbox for Matlab and can be found at both + # http://www.mathworks.com/access/helpdesk/help/toolbox/stats/kurtosis.shtml + # http://www.mathworks.com/access/helpdesk/help/toolbox/stats/skewness.shtml + # Note that both test cases came from here. + testcase = [1,2,3,4] + testmathworks = ma.fix_invalid([1.165, 0.6268, 0.0751, 0.3516, -0.6965, + np.nan]) + testcase_2d = ma.array( + np.array([[0.05245846, 0.50344235, 0.86589117, 0.36936353, 0.46961149], + [0.11574073, 0.31299969, 0.45925772, 0.72618805, 0.75194407], + [0.67696689, 0.91878127, 0.09769044, 0.04645137, 0.37615733], + [0.05903624, 0.29908861, 0.34088298, 0.66216337, 0.83160998], + [0.64619526, 0.94894632, 0.27855892, 0.0706151, 0.39962917]]), + mask=np.array([[True, False, False, True, False], + [True, True, True, False, True], + [False, False, False, False, False], + [True, True, True, True, True], + [False, False, True, False, False]], dtype=np.bool)) + + def test_moment(self): + y = mstats.moment(self.testcase,1) + assert_almost_equal(y,0.0,10) + y = mstats.moment(self.testcase,2) + assert_almost_equal(y,1.25) + y = mstats.moment(self.testcase,3) + assert_almost_equal(y,0.0) + y = mstats.moment(self.testcase,4) + assert_almost_equal(y,2.5625) + + def test_variation(self): + y = mstats.variation(self.testcase) + assert_almost_equal(y,0.44721359549996, 10) + + def test_skewness(self): + y = mstats.skew(self.testmathworks) + assert_almost_equal(y,-0.29322304336607,10) + y = mstats.skew(self.testmathworks,bias=0) + assert_almost_equal(y,-0.437111105023940,10) + y = mstats.skew(self.testcase) + assert_almost_equal(y,0.0,10) + + def test_kurtosis(self): + # Set flags for axis = 0 and fisher=0 (Pearson's definition of kurtosis + # for compatibility with Matlab) + y = mstats.kurtosis(self.testmathworks,0,fisher=0,bias=1) + assert_almost_equal(y, 2.1658856802973,10) + # Note that MATLAB has confusing docs for the following case + # kurtosis(x,0) gives an unbiased estimate of Pearson's skewness + # kurtosis(x) gives a biased estimate of Fisher's skewness (Pearson-3) + # The MATLAB docs imply that both should give Fisher's + y = mstats.kurtosis(self.testmathworks,fisher=0, bias=0) + assert_almost_equal(y, 3.663542721189047,10) + y = mstats.kurtosis(self.testcase,0,0) + assert_almost_equal(y,1.64) + + # test that kurtosis works on multidimensional masked arrays + correct_2d = ma.array(np.array([-1.5, -3., -1.47247052385, 0., + -1.26979517952]), + mask=np.array([False, False, False, True, + False], dtype=np.bool)) + assert_array_almost_equal(mstats.kurtosis(self.testcase_2d, 1), + correct_2d) + for i, row in enumerate(self.testcase_2d): + assert_almost_equal(mstats.kurtosis(row), correct_2d[i]) + + correct_2d_bias_corrected = ma.array( + np.array([-1.5, -3., -1.88988209538, 0., -0.5234638463918877]), + mask=np.array([False, False, False, True, False], dtype=np.bool)) + assert_array_almost_equal(mstats.kurtosis(self.testcase_2d, 1, + bias=False), + correct_2d_bias_corrected) + for i, row in enumerate(self.testcase_2d): + assert_almost_equal(mstats.kurtosis(row, bias=False), + correct_2d_bias_corrected[i]) + + # Check consistency between stats and mstats implementations + assert_array_almost_equal_nulp(mstats.kurtosis(self.testcase_2d[2, :]), + stats.kurtosis(self.testcase_2d[2, :])) + + def test_mode(self): + a1 = [0,0,0,1,1,1,2,3,3,3,3,4,5,6,7] + a2 = np.reshape(a1, (3,5)) + a3 = np.array([1,2,3,4,5,6]) + a4 = 
np.reshape(a3, (3,2)) + ma1 = ma.masked_where(ma.array(a1) > 2, a1) + ma2 = ma.masked_where(a2 > 2, a2) + ma3 = ma.masked_where(a3 < 2, a3) + ma4 = ma.masked_where(ma.array(a4) < 2, a4) + assert_equal(mstats.mode(a1, axis=None), (3,4)) + assert_equal(mstats.mode(a1, axis=0), (3,4)) + assert_equal(mstats.mode(ma1, axis=None), (0,3)) + assert_equal(mstats.mode(a2, axis=None), (3,4)) + assert_equal(mstats.mode(ma2, axis=None), (0,3)) + assert_equal(mstats.mode(a3, axis=None), (1,1)) + assert_equal(mstats.mode(ma3, axis=None), (2,1)) + assert_equal(mstats.mode(a2, axis=0), ([[0,0,0,1,1]], [[1,1,1,1,1]])) + assert_equal(mstats.mode(ma2, axis=0), ([[0,0,0,1,1]], [[1,1,1,1,1]])) + assert_equal(mstats.mode(a2, axis=-1), ([[0],[3],[3]], [[3],[3],[1]])) + assert_equal(mstats.mode(ma2, axis=-1), ([[0],[1],[0]], [[3],[1],[0]])) + assert_equal(mstats.mode(ma4, axis=0), ([[3,2]], [[1,1]])) + assert_equal(mstats.mode(ma4, axis=-1), ([[2],[3],[5]], [[1],[1],[1]])) + + +class TestPercentile(TestCase): + def setUp(self): + self.a1 = [3,4,5,10,-3,-5,6] + self.a2 = [3,-6,-2,8,7,4,2,1] + self.a3 = [3.,4,5,10,-3,-5,-6,7.0] + + def test_percentile(self): + x = np.arange(8) * 0.5 + assert_equal(mstats.scoreatpercentile(x, 0), 0.) + assert_equal(mstats.scoreatpercentile(x, 100), 3.5) + assert_equal(mstats.scoreatpercentile(x, 50), 1.75) + + def test_2D(self): + x = ma.array([[1, 1, 1], + [1, 1, 1], + [4, 4, 3], + [1, 1, 1], + [1, 1, 1]]) + assert_equal(mstats.scoreatpercentile(x,50), [1,1,1]) + + +class TestVariability(TestCase): + """ Comparison numbers are found using R v.1.5.1 + note that length(testcase) = 4 + """ + testcase = ma.fix_invalid([1,2,3,4,np.nan]) + + def test_signaltonoise(self): + # This is not in R, so used: + # mean(testcase, axis=0) / (sqrt(var(testcase)*3/4)) + y = mstats.signaltonoise(self.testcase) + assert_almost_equal(y,2.236067977) + + def test_sem(self): + # This is not in R, so used: sqrt(var(testcase)*3/4) / sqrt(3) + y = mstats.sem(self.testcase) + assert_almost_equal(y, 0.6454972244) + n = self.testcase.count() + assert_allclose(mstats.sem(self.testcase, ddof=0) * np.sqrt(n/(n-2)), + mstats.sem(self.testcase, ddof=2)) + + def test_zmap(self): + # This is not in R, so tested by using: + # (testcase[i]-mean(testcase,axis=0)) / sqrt(var(testcase)*3/4) + y = mstats.zmap(self.testcase, self.testcase) + desired_unmaskedvals = ([-1.3416407864999, -0.44721359549996, + 0.44721359549996, 1.3416407864999]) + assert_array_almost_equal(desired_unmaskedvals, + y.data[y.mask == False], decimal=12) + + def test_zscore(self): + # This is not in R, so tested by using: + # (testcase[i]-mean(testcase,axis=0)) / sqrt(var(testcase)*3/4) + y = mstats.zscore(self.testcase) + desired = ma.fix_invalid([-1.3416407864999, -0.44721359549996, + 0.44721359549996, 1.3416407864999, np.nan]) + assert_almost_equal(desired, y, decimal=12) + + +class TestMisc(TestCase): + + def test_obrientransform(self): + args = [[5]*5+[6]*11+[7]*9+[8]*3+[9]*2+[10]*2, + [6]+[7]*2+[8]*4+[9]*9+[10]*16] + result = [5*[3.1828]+11*[0.5591]+9*[0.0344]+3*[1.6086]+2*[5.2817]+2*[11.0538], + [10.4352]+2*[4.8599]+4*[1.3836]+9*[0.0061]+16*[0.7277]] + assert_almost_equal(np.round(mstats.obrientransform(*args).T,4), + result,4) + + def test_kstwosamp(self): + x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1], + [4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3], + [3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan], + [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]] + x = ma.fix_invalid(x).T + (winter,spring,summer,fall) = x.T + + 
assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring),4), + (0.1818,0.9892)) + assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring,'g'),4), + (0.1469,0.7734)) + assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring,'l'),4), + (0.1818,0.6744)) + + def test_friedmanchisq(self): + # No missing values + args = ([9.0,9.5,5.0,7.5,9.5,7.5,8.0,7.0,8.5,6.0], + [7.0,6.5,7.0,7.5,5.0,8.0,6.0,6.5,7.0,7.0], + [6.0,8.0,4.0,6.0,7.0,6.5,6.0,4.0,6.5,3.0]) + result = mstats.friedmanchisquare(*args) + assert_almost_equal(result[0], 10.4737, 4) + assert_almost_equal(result[1], 0.005317, 6) + # Missing values + x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1], + [4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3], + [3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan], + [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]] + x = ma.fix_invalid(x) + result = mstats.friedmanchisquare(*x) + assert_almost_equal(result[0], 2.0156, 4) + assert_almost_equal(result[1], 0.5692, 4) + + +def test_regress_simple(): + # Regress a line with sinusoidal noise. Test for #1273. + x = np.linspace(0, 100, 100) + y = 0.2 * np.linspace(0, 100, 100) + 10 + y += np.sin(np.linspace(0, 20, 100)) + + slope, intercept, r_value, p_value, sterr = mstats.linregress(x, y) + assert_almost_equal(slope, 0.19644990055858422) + assert_almost_equal(intercept, 10.211269918932341) + + +def test_plotting_positions(): + # Regression test for #1256 + pos = mstats.plotting_positions(np.arange(3), 0, 0) + assert_array_almost_equal(pos.data, np.array([0.25, 0.5, 0.75])) + + +class TestNormalitytests(): + + def test_vs_nonmasked(self): + x = np.array((-2,-1,0,1,2,3)*4)**2 + assert_array_almost_equal(mstats.normaltest(x), stats.normaltest(x)) + assert_array_almost_equal(mstats.skewtest(x), stats.skewtest(x)) + assert_array_almost_equal(mstats.kurtosistest(x), + stats.kurtosistest(x)) + + funcs = [stats.normaltest, stats.skewtest, stats.kurtosistest] + mfuncs = [mstats.normaltest, mstats.skewtest, mstats.kurtosistest] + x = [1, 2, 3, 4] + for func, mfunc in zip(funcs, mfuncs): + assert_raises(ValueError, func, x) + assert_raises(ValueError, mfunc, x) + + def test_axis_None(self): + # Test axis=None (equal to axis=0 for 1-D input) + x = np.array((-2,-1,0,1,2,3)*4)**2 + assert_allclose(mstats.normaltest(x, axis=None), mstats.normaltest(x)) + assert_allclose(mstats.skewtest(x, axis=None), mstats.skewtest(x)) + assert_allclose(mstats.kurtosistest(x, axis=None), + mstats.kurtosistest(x)) + + def test_maskedarray_input(self): + # Add some masked values, test result doesn't change + x = np.array((-2,-1,0,1,2,3)*4)**2 + xm = np.ma.array(np.r_[np.inf, x, 10], + mask=np.r_[True, [False] * x.size, True]) + assert_allclose(mstats.normaltest(xm), stats.normaltest(x)) + assert_allclose(mstats.skewtest(xm), stats.skewtest(x)) + assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x)) + + def test_nd_input(self): + x = np.array((-2,-1,0,1,2,3)*4)**2 + x_2d = np.vstack([x] * 2).T + for func in [mstats.normaltest, mstats.skewtest, mstats.kurtosistest]: + res_1d = func(x) + res_2d = func(x_2d) + assert_allclose(res_2d[0], [res_1d[0]] * 2) + assert_allclose(res_2d[1], [res_1d[1]] * 2) + + +#TODO: for all ttest functions, add tests with masked array inputs +class TestTtest_rel(): + + def test_vs_nonmasked(self): + np.random.seed(1234567) + outcome = np.random.randn(20, 4) + [0, 0, 1, 2] + + # 1-D inputs + res1 = stats.ttest_rel(outcome[:, 0], outcome[:, 1]) + res2 = mstats.ttest_rel(outcome[:, 0], outcome[:, 1]) + assert_allclose(res1, res2) + + # 2-D inputs + res1 = 
stats.ttest_rel(outcome[:, 0], outcome[:, 1], axis=None) + res2 = mstats.ttest_rel(outcome[:, 0], outcome[:, 1], axis=None) + assert_allclose(res1, res2) + res1 = stats.ttest_rel(outcome[:, :2], outcome[:, 2:], axis=0) + res2 = mstats.ttest_rel(outcome[:, :2], outcome[:, 2:], axis=0) + assert_allclose(res1, res2) + + # Check default is axis=0 + res3 = mstats.ttest_rel(outcome[:, :2], outcome[:, 2:]) + assert_allclose(res2, res3) + + def test_invalid_input_size(self): + assert_raises(ValueError, mstats.ttest_rel, + np.arange(10), np.arange(11)) + x = np.arange(24) + assert_raises(ValueError, mstats.ttest_rel, + x.reshape(2, 3, 4), x.reshape(2, 4, 3), axis=1) + assert_raises(ValueError, mstats.ttest_rel, + x.reshape(2, 3, 4), x.reshape(2, 4, 3), axis=2) + + def test_empty(self): + res1 = mstats.ttest_rel([], []) + assert_(np.all(np.isnan(res1))) + + +class TestTtest_ind(): + + def test_vs_nonmasked(self): + np.random.seed(1234567) + outcome = np.random.randn(20, 4) + [0, 0, 1, 2] + + # 1-D inputs + res1 = stats.ttest_ind(outcome[:, 0], outcome[:, 1]) + res2 = mstats.ttest_ind(outcome[:, 0], outcome[:, 1]) + assert_allclose(res1, res2) + + # 2-D inputs + res1 = stats.ttest_ind(outcome[:, 0], outcome[:, 1], axis=None) + res2 = mstats.ttest_ind(outcome[:, 0], outcome[:, 1], axis=None) + assert_allclose(res1, res2) + res1 = stats.ttest_ind(outcome[:, :2], outcome[:, 2:], axis=0) + res2 = mstats.ttest_ind(outcome[:, :2], outcome[:, 2:], axis=0) + assert_allclose(res1, res2) + + # Check default is axis=0 + res3 = mstats.ttest_ind(outcome[:, :2], outcome[:, 2:]) + assert_allclose(res2, res3) + + def test_empty(self): + res1 = mstats.ttest_ind([], []) + assert_(np.all(np.isnan(res1))) + + +class TestTtest_1samp(): + + def test_vs_nonmasked(self): + np.random.seed(1234567) + outcome = np.random.randn(20, 4) + [0, 0, 1, 2] + + # 1-D inputs + res1 = stats.ttest_1samp(outcome[:, 0], 1) + res2 = mstats.ttest_1samp(outcome[:, 0], 1) + assert_allclose(res1, res2) + + # 2-D inputs + res1 = stats.ttest_1samp(outcome[:, 0], outcome[:, 1], axis=None) + res2 = mstats.ttest_1samp(outcome[:, 0], outcome[:, 1], axis=None) + assert_allclose(res1, res2) + res1 = stats.ttest_1samp(outcome[:, :2], outcome[:, 2:], axis=0) + res2 = mstats.ttest_1samp(outcome[:, :2], outcome[:, 2:], axis=0) + assert_allclose(res1, res2) + + # Check default is axis=0 + res3 = mstats.ttest_1samp(outcome[:, :2], outcome[:, 2:]) + assert_allclose(res2, res3) + + def test_empty(self): + res1 = mstats.ttest_1samp([], 1) + assert_(np.all(np.isnan(res1))) + + +if __name__ == "__main__": + run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_mstats_extras.py b/pywafo/src/wafo/stats/tests/test_mstats_extras.py index 2fe5d72..2ab5762 100644 --- a/pywafo/src/wafo/stats/tests/test_mstats_extras.py +++ b/pywafo/src/wafo/stats/tests/test_mstats_extras.py @@ -1,103 +1,107 @@ -# pylint: disable-msg=W0611, W0612, W0511,R0201 -"""Tests suite for maskedArray statistics. 
- -:author: Pierre Gerard-Marchant -:contact: pierregm_at_uga_dot_edu -""" -__author__ = "Pierre GF Gerard-Marchant ($Author: backtopop $)" - -import numpy as np - -import numpy.ma as ma - -import scipy.stats.mstats as ms -#import scipy.stats.mmorestats as mms - -from numpy.testing import TestCase, run_module_suite, assert_equal, \ - assert_almost_equal, assert_ - - -class TestMisc(TestCase): - # - def __init__(self, *args, **kwargs): - TestCase.__init__(self, *args, **kwargs) - # - def test_mjci(self): - "Tests the Marits-Jarrett estimator" - data = ma.array([ 77, 87, 88,114,151,210,219,246,253,262, - 296,299,306,376,428,515,666,1310,2611]) - assert_almost_equal(ms.mjci(data),[55.76819,45.84028,198.87875],5) - # - def test_trimmedmeanci(self): - "Tests the confidence intervals of the trimmed mean." - data = ma.array([545,555,558,572,575,576,578,580, - 594,605,635,651,653,661,666]) - assert_almost_equal(ms.trimmed_mean(data,0.2), 596.2, 1) - assert_equal(np.round(ms.trimmed_mean_ci(data,(0.2,0.2)),1), - [561.8, 630.6]) - # - def test_idealfourths(self): - "Tests ideal-fourths" - test = np.arange(100) - assert_almost_equal(np.asarray(ms.idealfourths(test)), - [24.416667,74.583333],6) - test_2D = test.repeat(3).reshape(-1,3) - assert_almost_equal(ms.idealfourths(test_2D, axis=0), - [[24.416667,24.416667,24.416667], - [74.583333,74.583333,74.583333]],6) - assert_almost_equal(ms.idealfourths(test_2D, axis=1), - test.repeat(2).reshape(-1,2)) - test = [0,0] - _result = ms.idealfourths(test) - assert_(np.isnan(_result).all()) - -#.............................................................................. -class TestQuantiles(TestCase): - # - def __init__(self, *args, **kwargs): - TestCase.__init__(self, *args, **kwargs) - # - def test_hdquantiles(self): - data = [0.706560797,0.727229578,0.990399276,0.927065621,0.158953014, - 0.887764025,0.239407086,0.349638551,0.972791145,0.149789972, - 0.936947700,0.132359948,0.046041972,0.641675031,0.945530547, - 0.224218684,0.771450991,0.820257774,0.336458052,0.589113496, - 0.509736129,0.696838829,0.491323573,0.622767425,0.775189248, - 0.641461450,0.118455200,0.773029450,0.319280007,0.752229111, - 0.047841438,0.466295911,0.583850781,0.840581845,0.550086491, - 0.466470062,0.504765074,0.226855960,0.362641207,0.891620942, - 0.127898691,0.490094097,0.044882048,0.041441695,0.317976349, - 0.504135618,0.567353033,0.434617473,0.636243375,0.231803616, - 0.230154113,0.160011327,0.819464108,0.854706985,0.438809221, - 0.487427267,0.786907310,0.408367937,0.405534192,0.250444460, - 0.995309248,0.144389588,0.739947527,0.953543606,0.680051621, - 0.388382017,0.863530727,0.006514031,0.118007779,0.924024803, - 0.384236354,0.893687694,0.626534881,0.473051932,0.750134705, - 0.241843555,0.432947602,0.689538104,0.136934797,0.150206859, - 0.474335206,0.907775349,0.525869295,0.189184225,0.854284286, - 0.831089744,0.251637345,0.587038213,0.254475554,0.237781276, - 0.827928620,0.480283781,0.594514455,0.213641488,0.024194386, - 0.536668589,0.699497811,0.892804071,0.093835427,0.731107772] - # - assert_almost_equal(ms.hdquantiles(data,[0., 1.]), - [0.006514031, 0.995309248]) - hdq = ms.hdquantiles(data,[0.25, 0.5, 0.75]) - assert_almost_equal(hdq, [0.253210762, 0.512847491, 0.762232442,]) - hdq = ms.hdquantiles_sd(data,[0.25, 0.5, 0.75]) - assert_almost_equal(hdq, [0.03786954, 0.03805389, 0.03800152,], 4) - # - data = np.array(data).reshape(10,10) - hdq = ms.hdquantiles(data,[0.25,0.5,0.75],axis=0) - assert_almost_equal(hdq[:,0], ms.hdquantiles(data[:,0],[0.25,0.5,0.75])) - 
assert_almost_equal(hdq[:,-1], ms.hdquantiles(data[:,-1],[0.25,0.5,0.75])) - hdq = ms.hdquantiles(data,[0.25,0.5,0.75],axis=0,var=True) - assert_almost_equal(hdq[...,0], - ms.hdquantiles(data[:,0],[0.25,0.5,0.75],var=True)) - assert_almost_equal(hdq[...,-1], - ms.hdquantiles(data[:,-1],[0.25,0.5,0.75], var=True)) - - -############################################################################### - -if __name__ == "__main__": - run_module_suite() +# pylint: disable-msg=W0611, W0612, W0511,R0201 +"""Test suite for masked array statistics. + +:author: Pierre Gerard-Marchant +:contact: pierregm_at_uga_dot_edu +""" +from __future__ import division, print_function, absolute_import + +__author__ = "Pierre GF Gerard-Marchant ($Author: backtopop $)" + +import numpy as np + +import numpy.ma as ma + +import wafo.stats.mstats as ms +#import wafo.stats.mmorestats as mms + +from numpy.testing import TestCase, run_module_suite, assert_equal, \ + assert_almost_equal, assert_ + + +class TestMisc(TestCase): + + def __init__(self, *args, **kwargs): + TestCase.__init__(self, *args, **kwargs) + + def test_mjci(self): + "Tests the Maritz-Jarrett estimator" + data = ma.array([77, 87, 88,114,151,210,219,246,253,262, + 296,299,306,376,428,515,666,1310,2611]) + assert_almost_equal(ms.mjci(data),[55.76819,45.84028,198.87875],5) + + def test_trimmedmeanci(self): + "Tests the confidence intervals of the trimmed mean." + data = ma.array([545,555,558,572,575,576,578,580, + 594,605,635,651,653,661,666]) + assert_almost_equal(ms.trimmed_mean(data,0.2), 596.2, 1) + assert_equal(np.round(ms.trimmed_mean_ci(data,(0.2,0.2)),1), + [561.8, 630.6]) + + def test_idealfourths(self): + "Tests ideal-fourths" + test = np.arange(100) + assert_almost_equal(np.asarray(ms.idealfourths(test)), + [24.416667,74.583333],6) + test_2D = test.repeat(3).reshape(-1,3) + assert_almost_equal(ms.idealfourths(test_2D, axis=0), + [[24.416667,24.416667,24.416667], + [74.583333,74.583333,74.583333]],6) + assert_almost_equal(ms.idealfourths(test_2D, axis=1), + test.repeat(2).reshape(-1,2)) + test = [0,0] + _result = ms.idealfourths(test) + assert_(np.isnan(_result).all()) + +#..............................................................................
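+
+# A minimal illustrative sketch (assumption: a proportional 20% trim simply
+# drops int(0.2 * n) observations from each tail before averaging; mstats'
+# own boundary handling may differ slightly).  It shows by hand where the
+# 596.2 value asserted in TestMisc.test_trimmedmeanci above comes from.
+# This helper is not collected by the test runner.
+def _trimmed_mean_sketch():
+    vals = np.sort(np.array([545, 555, 558, 572, 575, 576, 578, 580,
+                             594, 605, 635, 651, 653, 661, 666], float))
+    k = int(0.2 * vals.size)              # 3 observations trimmed per tail
+    return vals[k:vals.size - k].mean()   # ~596.2, matching the assert above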
+ + +class TestQuantiles(TestCase): + + def __init__(self, *args, **kwargs): + TestCase.__init__(self, *args, **kwargs) + + def test_hdquantiles(self): + data = [0.706560797,0.727229578,0.990399276,0.927065621,0.158953014, + 0.887764025,0.239407086,0.349638551,0.972791145,0.149789972, + 0.936947700,0.132359948,0.046041972,0.641675031,0.945530547, + 0.224218684,0.771450991,0.820257774,0.336458052,0.589113496, + 0.509736129,0.696838829,0.491323573,0.622767425,0.775189248, + 0.641461450,0.118455200,0.773029450,0.319280007,0.752229111, + 0.047841438,0.466295911,0.583850781,0.840581845,0.550086491, + 0.466470062,0.504765074,0.226855960,0.362641207,0.891620942, + 0.127898691,0.490094097,0.044882048,0.041441695,0.317976349, + 0.504135618,0.567353033,0.434617473,0.636243375,0.231803616, + 0.230154113,0.160011327,0.819464108,0.854706985,0.438809221, + 0.487427267,0.786907310,0.408367937,0.405534192,0.250444460, + 0.995309248,0.144389588,0.739947527,0.953543606,0.680051621, + 0.388382017,0.863530727,0.006514031,0.118007779,0.924024803, + 0.384236354,0.893687694,0.626534881,0.473051932,0.750134705, + 0.241843555,0.432947602,0.689538104,0.136934797,0.150206859, + 0.474335206,0.907775349,0.525869295,0.189184225,0.854284286, + 0.831089744,0.251637345,0.587038213,0.254475554,0.237781276, + 0.827928620,0.480283781,0.594514455,0.213641488,0.024194386, + 0.536668589,0.699497811,0.892804071,0.093835427,0.731107772] + # + assert_almost_equal(ms.hdquantiles(data,[0., 1.]), + [0.006514031, 0.995309248]) + hdq = ms.hdquantiles(data,[0.25, 0.5, 0.75]) + assert_almost_equal(hdq, [0.253210762, 0.512847491, 0.762232442,]) + hdq = ms.hdquantiles_sd(data,[0.25, 0.5, 0.75]) + assert_almost_equal(hdq, [0.03786954, 0.03805389, 0.03800152,], 4) + # + data = np.array(data).reshape(10,10) + hdq = ms.hdquantiles(data,[0.25,0.5,0.75],axis=0) + assert_almost_equal(hdq[:,0], ms.hdquantiles(data[:,0],[0.25,0.5,0.75])) + assert_almost_equal(hdq[:,-1], ms.hdquantiles(data[:,-1],[0.25,0.5,0.75])) + hdq = ms.hdquantiles(data,[0.25,0.5,0.75],axis=0,var=True) + assert_almost_equal(hdq[...,0], + ms.hdquantiles(data[:,0],[0.25,0.5,0.75],var=True)) + assert_almost_equal(hdq[...,-1], + ms.hdquantiles(data[:,-1],[0.25,0.5,0.75], var=True)) + + +############################################################################### + +if __name__ == "__main__": + run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_multivariate.py b/pywafo/src/wafo/stats/tests/test_multivariate.py new file mode 100644 index 0000000..b6a9a9b --- /dev/null +++ b/pywafo/src/wafo/stats/tests/test_multivariate.py @@ -0,0 +1,275 @@ +""" +Test functions for multivariate normal distributions. + +""" +from __future__ import division, print_function, absolute_import + +from numpy.testing import (assert_almost_equal, + run_module_suite, assert_allclose, assert_equal, assert_raises) + +import numpy +import numpy as np + +import scipy.linalg +import scipy.stats._multivariate +from scipy.stats import multivariate_normal +from scipy.stats import norm + +from scipy.stats._multivariate import _psd_pinv_decomposed_log_pdet + +from scipy.integrate import romb + + +def test_scalar_values(): + np.random.seed(1234) + + # When evaluated on scalar data, the pdf should return a scalar + x, mean, cov = 1.5, 1.7, 2.5 + pdf = multivariate_normal.pdf(x, mean, cov) + assert_equal(pdf.ndim, 0) + + # When evaluated on a single vector, the pdf should return a scalar + x = np.random.randn(5) + mean = np.random.randn(5) + cov = np.abs(np.random.randn(5)) # Diagonal values for cov. 
matrix + pdf = multivariate_normal.pdf(x, mean, cov) + assert_equal(pdf.ndim, 0) + + +def test_logpdf(): + # Check that the log of the pdf is in fact the logpdf + np.random.seed(1234) + x = np.random.randn(5) + mean = np.random.randn(5) + cov = np.abs(np.random.randn(5)) + d1 = multivariate_normal.logpdf(x, mean, cov) + d2 = multivariate_normal.pdf(x, mean, cov) + assert_allclose(d1, np.log(d2)) + + +def test_large_pseudo_determinant(): + # Check that large pseudo-determinants are handled appropriately. + + # Construct a singular diagonal covariance matrix + # whose pseudo determinant overflows double precision. + large_total_log = 1000.0 + npos = 100 + nzero = 2 + large_entry = np.exp(large_total_log / npos) + n = npos + nzero + cov = np.zeros((n, n), dtype=float) + np.fill_diagonal(cov, large_entry) + cov[-nzero:, -nzero:] = 0 + + # Check some determinants. + assert_equal(scipy.linalg.det(cov), 0) + assert_equal(scipy.linalg.det(cov[:npos, :npos]), np.inf) + + # np.linalg.slogdet is only available in numpy 1.6+ + # but scipy currently supports numpy 1.5.1. + #assert_allclose(np.linalg.slogdet(cov[:npos, :npos]), (1, large_total_log)) + + # Check the pseudo-determinant. + U, log_pdet = scipy.stats._multivariate._psd_pinv_decomposed_log_pdet(cov) + assert_allclose(log_pdet, large_total_log) + + +def test_broadcasting(): + np.random.seed(1234) + n = 4 + + # Construct a random covariance matrix. + data = np.random.randn(n, n) + cov = np.dot(data, data.T) + mean = np.random.randn(n) + + # Construct an ndarray which can be interpreted as + # a 2x3 array whose elements are random data vectors. + X = np.random.randn(2, 3, n) + + # Check that multiple data points can be evaluated at once. + for i in range(2): + for j in range(3): + actual = multivariate_normal.pdf(X[i, j], mean, cov) + desired = multivariate_normal.pdf(X, mean, cov)[i, j] + assert_allclose(actual, desired) + + +def test_normal_1D(): + # The probability density function for a 1D normal variable should + # agree with the standard normal distribution in scipy.stats.distributions + x = np.linspace(0, 2, 10) + mean, cov = 1.2, 0.9 + scale = cov**0.5 + d1 = norm.pdf(x, mean, scale) + d2 = multivariate_normal.pdf(x, mean, cov) + assert_allclose(d1, d2) + + +def test_marginalization(): + # Integrating out one of the variables of a 2D Gaussian should + # yield a 1D Gaussian + mean = np.array([2.5, 3.5]) + cov = np.array([[.5, 0.2], [0.2, .6]]) + n = 2**8 + 1 # Number of samples + delta = 6 / (n - 1) # Grid spacing + + v = np.linspace(0, 6, n) + xv, yv = np.meshgrid(v, v) + pos = np.empty((n, n, 2)) + pos[:, :, 0] = xv + pos[:, :, 1] = yv + pdf = multivariate_normal.pdf(pos, mean, cov) + + # Marginalize over x and y axis + margin_x = romb(pdf, delta, axis=0) + margin_y = romb(pdf, delta, axis=1) + + # Compare with standard normal distribution + gauss_x = norm.pdf(v, loc=mean[0], scale=cov[0, 0]**0.5) + gauss_y = norm.pdf(v, loc=mean[1], scale=cov[1, 1]**0.5) + assert_allclose(margin_x, gauss_x, rtol=1e-2, atol=1e-2) + assert_allclose(margin_y, gauss_y, rtol=1e-2, atol=1e-2) + + +def test_frozen(): + # The frozen distribution should agree with the regular one + np.random.seed(1234) + x = np.random.randn(5) + mean = np.random.randn(5) + cov = np.abs(np.random.randn(5)) + norm_frozen = multivariate_normal(mean, cov) + assert_allclose(norm_frozen.pdf(x), multivariate_normal.pdf(x, mean, cov)) + assert_allclose(norm_frozen.logpdf(x), + multivariate_normal.logpdf(x, mean, cov)) + + +def test_pseudodet_pinv(): + # Make sure that pseudo-inverse 
and pseudo-det agree on cutoff + + # Assemble random covariance matrix with large and small eigenvalues + np.random.seed(1234) + n = 7 + x = np.random.randn(n, n) + cov = np.dot(x, x.T) + s, u = scipy.linalg.eigh(cov) + s = 0.5 * np.ones(n) + s[0] = 1.0 + s[-1] = 1e-7 + cov = np.dot(u, np.dot(np.diag(s), u.T)) + + # Set cond so that the lowest eigenvalue is below the cutoff + cond = 1e-5 + U, log_pdet = _psd_pinv_decomposed_log_pdet(cov, cond) + pinv = np.dot(U, U.T) + _, log_pdet_pinv = _psd_pinv_decomposed_log_pdet(pinv, cond) + + # Check that the log pseudo-determinant agrees with the sum + # of the logs of all but the smallest eigenvalue + assert_allclose(log_pdet, np.sum(np.log(s[:-1]))) + # Check that the pseudo-determinant of the pseudo-inverse + # agrees with 1 / pseudo-determinant + assert_allclose(-log_pdet, log_pdet_pinv) + + +def test_exception_nonsquare_cov(): + cov = [[1, 2, 3], [4, 5, 6]] + assert_raises(ValueError, _psd_pinv_decomposed_log_pdet, cov) + + +def test_exception_nonfinite_cov(): + cov_nan = [[1, 0], [0, np.nan]] + assert_raises(ValueError, _psd_pinv_decomposed_log_pdet, cov_nan) + cov_inf = [[1, 0], [0, np.inf]] + assert_raises(ValueError, _psd_pinv_decomposed_log_pdet, cov_inf) + + +def test_exception_non_psd_cov(): + cov = [[1, 0], [0, -1]] + assert_raises(ValueError, _psd_pinv_decomposed_log_pdet, cov) + + +def test_R_values(): + # Compare the multivariate pdf with some values precomputed + # in R version 3.0.1 (2013-05-16) on Mac OS X 10.6. + + # The values below were generated by the following R-script: + # > library(mnormt) + # > x <- seq(0, 2, length=5) + # > y <- 3*x - 2 + # > z <- x + cos(y) + # > mu <- c(1, 3, 2) + # > Sigma <- matrix(c(1,2,0,2,5,0.5,0,0.5,3), 3, 3) + # > r_pdf <- dmnorm(cbind(x,y,z), mu, Sigma) + r_pdf = np.array([0.0002214706, 0.0013819953, 0.0049138692, + 0.0103803050, 0.0140250800]) + + x = np.linspace(0, 2, 5) + y = 3 * x - 2 + z = x + np.cos(y) + r = np.array([x, y, z]).T + + mean = np.array([1, 3, 2], 'd') + cov = np.array([[1, 2, 0], [2, 5, .5], [0, .5, 3]], 'd') + + pdf = multivariate_normal.pdf(r, mean, cov) + assert_allclose(pdf, r_pdf, atol=1e-10) + + +def test_rvs_shape(): + # Check that rvs parses the mean and covariance correctly, and returns + # an array of the right shape + N = 300 + d = 4 + sample = multivariate_normal.rvs(mean=np.zeros(d), cov=1, size=N) + assert_equal(sample.shape, (N, d)) + + sample = multivariate_normal.rvs(mean=None, + cov=np.array([[2, .1], [.1, 1]]), + size=N) + assert_equal(sample.shape, (N, 2)) + + u = multivariate_normal(mean=0, cov=1) + sample = u.rvs(N) + assert_equal(sample.shape, (N, )) + + +def test_large_sample(): + # Generate large sample and compare sample mean and sample covariance + # with mean and covariance matrix. 
+ + np.random.seed(2846) + + n = 3 + mean = np.random.randn(n) + M = np.random.randn(n, n) + cov = np.dot(M, M.T) + size = 5000 + + sample = multivariate_normal.rvs(mean, cov, size) + + assert_allclose(numpy.cov(sample.T), cov, rtol=1e-1) + assert_allclose(sample.mean(0), mean, rtol=1e-1) + + +def test_entropy(): + np.random.seed(2846) + + n = 3 + mean = np.random.randn(n) + M = np.random.randn(n, n) + cov = np.dot(M, M.T) + + rv = multivariate_normal(mean, cov) + + # Check that frozen distribution agrees with entropy function + assert_almost_equal(rv.entropy(), multivariate_normal.entropy(mean, cov)) + # Compare entropy with manually computed expression involving + # the sum of the logs of the eigenvalues of the covariance matrix + eigs = np.linalg.eig(cov)[0] + desired = 1/2 * (n * (np.log(2*np.pi) + 1) + np.sum(np.log(eigs))) + assert_almost_equal(desired, rv.entropy()) + + +if __name__ == "__main__": + run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_rank.py b/pywafo/src/wafo/stats/tests/test_rank.py new file mode 100644 index 0000000..0561bd7 --- /dev/null +++ b/pywafo/src/wafo/stats/tests/test_rank.py @@ -0,0 +1,193 @@ +from __future__ import division, print_function, absolute_import + +import numpy as np +from numpy.testing import TestCase, run_module_suite, assert_equal, \ + assert_array_equal + +from scipy.stats import rankdata, tiecorrect + + +class TestTieCorrect(TestCase): + + def test_empty(self): + """An empty array requires no correction, should return 1.0.""" + ranks = np.array([], dtype=np.float64) + c = tiecorrect(ranks) + assert_equal(c, 1.0) + + def test_one(self): + """A single element requires no correction, should return 1.0.""" + ranks = np.array([1.0], dtype=np.float64) + c = tiecorrect(ranks) + assert_equal(c, 1.0) + + def test_no_correction(self): + """Arrays with no ties require no correction.""" + ranks = np.arange(2.0) + c = tiecorrect(ranks) + assert_equal(c, 1.0) + ranks = np.arange(3.0) + c = tiecorrect(ranks) + assert_equal(c, 1.0) + + def test_basic(self): + """Check a few basic examples of the tie correction factor.""" + # One tie of two elements + ranks = np.array([1.0, 2.5, 2.5]) + c = tiecorrect(ranks) + T = 2.0 + N = ranks.size + expected = 1.0 - (T**3 - T) / (N**3 - N) + assert_equal(c, expected) + + # One tie of two elements (same as above, but tie is not at the end) + ranks = np.array([1.5, 1.5, 3.0]) + c = tiecorrect(ranks) + T = 2.0 + N = ranks.size + expected = 1.0 - (T**3 - T) / (N**3 - N) + assert_equal(c, expected) + + # One tie of three elements + ranks = np.array([1.0, 3.0, 3.0, 3.0]) + c = tiecorrect(ranks) + T = 3.0 + N = ranks.size + expected = 1.0 - (T**3 - T) / (N**3 - N) + assert_equal(c, expected) + + # Two ties, lengths 2 and 3. 
+ ranks = np.array([1.5, 1.5, 4.0, 4.0, 4.0]) + c = tiecorrect(ranks) + T1 = 2.0 + T2 = 3.0 + N = ranks.size + expected = 1.0 - ((T1**3 - T1) + (T2**3 - T2)) / (N**3 - N) + assert_equal(c, expected) + + +class TestRankData(TestCase): + + def test_empty(self): + """stats.rankdata([]) should return an empty array.""" + a = np.array([], dtype=np.int) + r = rankdata(a) + assert_array_equal(r, np.array([], dtype=np.float64)) + r = rankdata([]) + assert_array_equal(r, np.array([], dtype=np.float64)) + + def test_one(self): + """Check stats.rankdata with an array of length 1.""" + data = [100] + a = np.array(data, dtype=np.int) + r = rankdata(a) + assert_array_equal(r, np.array([1.0], dtype=np.float64)) + r = rankdata(data) + assert_array_equal(r, np.array([1.0], dtype=np.float64)) + + def test_basic(self): + """Basic tests of stats.rankdata.""" + data = [100, 10, 50] + expected = np.array([3.0, 1.0, 2.0], dtype=np.float64) + a = np.array(data, dtype=np.int) + r = rankdata(a) + assert_array_equal(r, expected) + r = rankdata(data) + assert_array_equal(r, expected) + + data = [40, 10, 30, 10, 50] + expected = np.array([4.0, 1.5, 3.0, 1.5, 5.0], dtype=np.float64) + a = np.array(data, dtype=np.int) + r = rankdata(a) + assert_array_equal(r, expected) + r = rankdata(data) + assert_array_equal(r, expected) + + data = [20, 20, 20, 10, 10, 10] + expected = np.array([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=np.float64) + a = np.array(data, dtype=np.int) + r = rankdata(a) + assert_array_equal(r, expected) + r = rankdata(data) + assert_array_equal(r, expected) + # The docstring states explicitly that the argument is flattened. + a2d = a.reshape(2, 3) + r = rankdata(a2d) + assert_array_equal(r, expected) + + def test_large_int(self): + data = np.array([2**60, 2**60+1], dtype=np.uint64) + r = rankdata(data) + assert_array_equal(r, [1.0, 2.0]) + + data = np.array([2**60, 2**60+1], dtype=np.int64) + r = rankdata(data) + assert_array_equal(r, [1.0, 2.0]) + + data = np.array([2**60, -2**60+1], dtype=np.int64) + r = rankdata(data) + assert_array_equal(r, [2.0, 1.0]) + + def test_big_tie(self): + for n in [10000, 100000, 1000000]: + data = np.ones(n, dtype=int) + r = rankdata(data) + expected_rank = 0.5 * (n + 1) + assert_array_equal(r, expected_rank * data, + "test failed with n=%d" % n) + + +_cases = ( + # values, method, expected + ([], 'average', []), + ([], 'min', []), + ([], 'max', []), + ([], 'dense', []), + ([], 'ordinal', []), + # + ([100], 'average', [1.0]), + ([100], 'min', [1.0]), + ([100], 'max', [1.0]), + ([100], 'dense', [1.0]), + ([100], 'ordinal', [1.0]), + # + ([100, 100, 100], 'average', [2.0, 2.0, 2.0]), + ([100, 100, 100], 'min', [1.0, 1.0, 1.0]), + ([100, 100, 100], 'max', [3.0, 3.0, 3.0]), + ([100, 100, 100], 'dense', [1.0, 1.0, 1.0]), + ([100, 100, 100], 'ordinal', [1.0, 2.0, 3.0]), + # + ([100, 300, 200], 'average', [1.0, 3.0, 2.0]), + ([100, 300, 200], 'min', [1.0, 3.0, 2.0]), + ([100, 300, 200], 'max', [1.0, 3.0, 2.0]), + ([100, 300, 200], 'dense', [1.0, 3.0, 2.0]), + ([100, 300, 200], 'ordinal', [1.0, 3.0, 2.0]), + # + ([100, 200, 300, 200], 'average', [1.0, 2.5, 4.0, 2.5]), + ([100, 200, 300, 200], 'min', [1.0, 2.0, 4.0, 2.0]), + ([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]), + ([100, 200, 300, 200], 'dense', [1.0, 2.0, 3.0, 2.0]), + ([100, 200, 300, 200], 'ordinal', [1.0, 2.0, 4.0, 3.0]), + # + ([100, 200, 300, 200, 100], 'average', [1.5, 3.5, 5.0, 3.5, 1.5]), + ([100, 200, 300, 200, 100], 'min', [1.0, 3.0, 5.0, 3.0, 1.0]), + ([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 
2.0]), + ([100, 200, 300, 200, 100], 'dense', [1.0, 2.0, 3.0, 2.0, 1.0]), + ([100, 200, 300, 200, 100], 'ordinal', [1.0, 3.0, 5.0, 4.0, 2.0]), + # + ([10] * 30, 'ordinal', np.arange(1.0, 31.0)), +) + + +def test_cases(): + + def check_case(values, method, expected): + r = rankdata(values, method=method) + assert_array_equal(r, expected) + + for values, method, expected in _cases: + yield check_case, values, method, expected + + +if __name__ == "__main__": + run_module_suite() diff --git a/pywafo/src/wafo/stats/tests/test_stats.py b/pywafo/src/wafo/stats/tests/test_stats.py index e716d7d..5ecfe91 100644 --- a/pywafo/src/wafo/stats/tests/test_stats.py +++ b/pywafo/src/wafo/stats/tests/test_stats.py @@ -8,19 +8,21 @@ """ from __future__ import division, print_function, absolute_import -import sys +import warnings +from collections import namedtuple from numpy.testing import TestCase, assert_, assert_equal, \ assert_almost_equal, assert_array_almost_equal, assert_array_equal, \ assert_approx_equal, assert_raises, run_module_suite, \ assert_allclose, dec +import numpy.ma.testutils as mat from numpy import array, arange, float32, float64, power import numpy as np import wafo.stats as stats -""" Numbers in docstrings begining with 'W' refer to the section numbers +""" Numbers in docstrings beginning with 'W' refer to the section numbers and headings found in the STATISTICS QUIZ of Leland Wilkinson. These are considered to be essential functionality. True testing and evaluation of a statistics package requires use of the @@ -29,111 +31,79 @@ import wafo.stats as stats implementation in testing SAS, SPSS, and S-Plus """ -## Datasets -## These data sets are from the nasty.dat sets used by Wilkinson -## for MISS, need to be able to represent missing values -## For completeness, I should write the relevant tests and count them as failures -## Somewhat acceptable, since this is still beta software. It would count as a -## good target for 1.0 status -X = array([1,2,3,4,5,6,7,8,9],float) -ZERO= array([0,0,0,0,0,0,0,0,0], float) -#MISS=array([.,.,.,.,.,.,.,.,.], float) -BIG=array([99999991,99999992,99999993,99999994,99999995,99999996,99999997,99999998,99999999],float) -LITTLE=array([0.99999991,0.99999992,0.99999993,0.99999994,0.99999995,0.99999996,0.99999997,0.99999998,0.99999999],float) -HUGE=array([1e+12,2e+12,3e+12,4e+12,5e+12,6e+12,7e+12,8e+12,9e+12],float) -TINY=array([1e-12,2e-12,3e-12,4e-12,5e-12,6e-12,7e-12,8e-12,9e-12],float) -ROUND=array([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5],float) -X2 = X * X -X3 = X2 * X -X4 = X3 * X -X5 = X4 * X -X6 = X5 * X -X7 = X6 * X -X8 = X7 * X -X9 = X8 * X - -class TestRound(TestCase): - """ W.II. ROUND - - You should get the numbers 1 to 9. Many language compilers, - such as Turbo Pascal and Lattice C, fail this test (they round - numbers inconsistently). Needless to say, statical packages - written in these languages may fail the test as well. You can - also check the following expressions: - Y = INT(2.6*7 -0.2) (Y should be 18) - Y = 2-INT(EXP(LOG(SQR(2)*SQR(2)))) (Y should be 0) - Y = INT(3-EXP(LOG(SQR(2)*SQR(2)))) (Y should be 1) - INT is the integer function. It converts decimal numbers to - integers by throwing away numbers after the decimal point. EXP - is exponential, LOG is logarithm, and SQR is suqare root. You may - have to substitute similar names for these functions for different - packages. 
Since the square of a square root should return the same - number, and the exponential of a log should return the same number, - we should get back a 2 from this function of functions. By taking - the integer result and subtracting from 2, we are exposing the - roundoff errors. These simple functions are at the heart of - statistical calculations. - """ - - def test_rounding0(self): - """ W.II.A.0. Print ROUND with only one digit. - - You should get the numbers 1 to 9. Many language compilers, - such as Turbo Pascal and Lattice C, fail this test (they round - numbers inconsistently). Needless to say, statical packages - written in these languages may fail the test as well. - """ - if sys.version_info[0] >= 3: - # round to even - for i in range(0,9): - y = round(ROUND[i]) - assert_equal(y, 2*((i+1)//2)) - else: - for i in range(0,9): - y = round(ROUND[i]) - assert_equal(y,i+1) - - def test_rounding1(self): - """ W.II.A.1. Y = INT(2.6*7 -0.2) (Y should be 18)""" - y = int(2.6*7 -0.2) - assert_equal(y, 18) - - def test_rounding2(self): - """ W.II.A.2. Y = 2-INT(EXP(LOG(SQR(2)*SQR(2)))) (Y should be 0)""" - y=2-int(np.exp(np.log(np.sqrt(2.)*np.sqrt(2.)))) - assert_equal(y,0) - - def test_rounding3(self): - """ W.II.A.3. Y = INT(3-EXP(LOG(SQR(2)*SQR(2)))) (Y should be 1)""" - y=(int(round((3-np.exp(np.log(np.sqrt(2.0)*np.sqrt(2.0))))))) - assert_equal(y,1) - -class TestBasicStats(TestCase): - """ W.II.C. Compute basic statistic on all the variables. - - The means should be the fifth value of all the variables (case FIVE). - The standard deviations should be "undefined" or missing for MISS, - 0 for ZERO, and 2.738612788 (times 10 to a power) for all the other variables. - II. C. Basic Statistics - """ - +# Datasets +# These data sets are from the nasty.dat sets used by Wilkinson +# For completeness, I should write the relevant tests and count them as failures +# Somewhat acceptable, since this is still beta software. 
It would count as a +# good target for 1.0 status +X = array([1,2,3,4,5,6,7,8,9], float) +ZERO = array([0,0,0,0,0,0,0,0,0], float) +BIG = array([99999991,99999992,99999993,99999994,99999995,99999996,99999997, + 99999998,99999999], float) +LITTLE = array([0.99999991,0.99999992,0.99999993,0.99999994,0.99999995,0.99999996, + 0.99999997,0.99999998,0.99999999], float) +HUGE = array([1e+12,2e+12,3e+12,4e+12,5e+12,6e+12,7e+12,8e+12,9e+12], float) +TINY = array([1e-12,2e-12,3e-12,4e-12,5e-12,6e-12,7e-12,8e-12,9e-12], float) +ROUND = array([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5], float) + + +class TestTrimmedStats(TestCase): + # TODO: write these tests to handle missing values properly dprec = np.finfo(np.float64).precision - # Really need to write these tests to handle missing values properly - def test_tmeanX(self): + def test_tmean(self): y = stats.tmean(X, (2, 8), (True, True)) - assert_approx_equal(y, 5.0, significant=TestBasicStats.dprec) + assert_approx_equal(y, 5.0, significant=self.dprec) + + y1 = stats.tmean(X, limits=(2, 8), inclusive=(False, False)) + y2 = stats.tmean(X, limits=None) + assert_approx_equal(y1, y2, significant=self.dprec) - def test_tvarX(self): - y = stats.tvar(X, (2, 8), (True, True)) - assert_approx_equal(y, 4.6666666666666661, - significant=TestBasicStats.dprec) + def test_tvar(self): + y = stats.tvar(X, limits=(2, 8), inclusive=(True, True)) + assert_approx_equal(y, 4.6666666666666661, significant=self.dprec) - def test_tstdX(self): + y = stats.tvar(X, limits=None) + assert_approx_equal(y, X.var(ddof=1), significant=self.dprec) + + def test_tstd(self): y = stats.tstd(X, (2, 8), (True, True)) - assert_approx_equal(y, 2.1602468994692865, - significant=TestBasicStats.dprec) + assert_approx_equal(y, 2.1602468994692865, significant=self.dprec) + + y = stats.tstd(X, limits=None) + assert_approx_equal(y, X.std(ddof=1), significant=self.dprec) + def test_tmin(self): + x = np.arange(10) + assert_equal(stats.tmin(x), 0) + assert_equal(stats.tmin(x, lowerlimit=0), 0) + assert_equal(stats.tmin(x, lowerlimit=0, inclusive=False), 1) + + x = x.reshape((5, 2)) + assert_equal(stats.tmin(x, lowerlimit=0, inclusive=False), [2, 1]) + assert_equal(stats.tmin(x, axis=1), [0, 2, 4, 6, 8]) + assert_equal(stats.tmin(x, axis=None), 0) + + def test_tmax(self): + x = np.arange(10) + assert_equal(stats.tmax(x), 9) + assert_equal(stats.tmax(x, upperlimit=9),9) + assert_equal(stats.tmax(x, upperlimit=9, inclusive=False), 8) + + x = x.reshape((5, 2)) + assert_equal(stats.tmax(x, upperlimit=9, inclusive=False), [8, 7]) + assert_equal(stats.tmax(x, axis=1), [1, 3, 5, 7, 9]) + assert_equal(stats.tmax(x, axis=None), 9) + + def test_tsem(self): + y = stats.tsem(X, limits=(3, 8), inclusive=(False, True)) + y_ref = np.array([4, 5, 6, 7, 8]) + assert_approx_equal(y, y_ref.std(ddof=1) / np.sqrt(y_ref.size), + significant=self.dprec) + + assert_approx_equal(stats.tsem(X, limits=[-1, 10]), + stats.tsem(X, limits=None), + significant=self.dprec) class TestNanFunc(TestCase): @@ -150,17 +120,17 @@ class TestNanFunc(TestCase): self.Xsomet = self.Xsomet[1:] def test_nanmean_none(self): - """Check nanmean when no values are nan.""" + # Check nanmean when no values are nan. m = stats.nanmean(X) assert_approx_equal(m, X[4]) def test_nanmean_some(self): - """Check nanmean when some values only are nan.""" + # Check nanmean when some values only are nan. 
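+        # (A note on the expected value, assuming the setUp above builds
+        # Xsomet as the non-nan part of Xsome: the nan-aware mean should then
+        # simply match mean(Xsomet) = 5.5 here.)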
m = stats.nanmean(self.Xsome) assert_approx_equal(m, 5.5) def test_nanmean_all(self): - """Check nanmean when all values are nan.""" + # Check nanmean when all values are nan. olderr = np.seterr(all='ignore') try: m = stats.nanmean(self.Xall) @@ -169,17 +139,17 @@ class TestNanFunc(TestCase): assert_(np.isnan(m)) def test_nanstd_none(self): - """Check nanstd when no values are nan.""" + # Check nanstd when no values are nan. s = stats.nanstd(self.X) assert_approx_equal(s, np.std(self.X, ddof=1)) def test_nanstd_some(self): - """Check nanstd when some values only are nan.""" + # Check nanstd when some values only are nan. s = stats.nanstd(self.Xsome) assert_approx_equal(s, np.std(self.Xsomet, ddof=1)) def test_nanstd_all(self): - """Check nanstd when all values are nan.""" + # Check nanstd when all values are nan. olderr = np.seterr(all='ignore') try: s = stats.nanstd(self.Xall) @@ -187,27 +157,31 @@ class TestNanFunc(TestCase): np.seterr(**olderr) assert_(np.isnan(s)) + def test_nanstd_bias_kw(self): + s = stats.nanstd(self.X, bias=True) + assert_approx_equal(s, np.std(self.X, ddof=0)) + def test_nanstd_negative_axis(self): x = np.array([1, 2, 3]) assert_equal(stats.nanstd(x, -1), 1) def test_nanmedian_none(self): - """Check nanmedian when no values are nan.""" + # Check nanmedian when no values are nan. m = stats.nanmedian(self.X) assert_approx_equal(m, np.median(self.X)) def test_nanmedian_some(self): - """Check nanmedian when some values only are nan.""" + # Check nanmedian when some values only are nan. m = stats.nanmedian(self.Xsome) assert_approx_equal(m, np.median(self.Xsomet)) def test_nanmedian_all(self): - """Check nanmedian when all values are nan.""" + # Check nanmedian when all values are nan. m = stats.nanmedian(self.Xall) assert_(np.isnan(m)) def test_nanmedian_scalars(self): - """Check nanmedian for scalar inputs. See ticket #1098.""" + # Check nanmedian for scalar inputs. See ticket #1098. assert_equal(stats.nanmedian(1), np.median(1)) assert_equal(stats.nanmedian(True), np.median(True)) assert_equal(stats.nanmedian(np.array(1)), np.median(np.array(1))) @@ -341,6 +315,16 @@ class TestCorrPearsonr(TestCase): assert_equal(r, -1.0) assert_equal(prob, 0.0) + def test_basic(self): + # A basic test, with a correlation coefficient + # that is not 1 or -1. + a = array([-1, 0, 1]) + b = array([0, 0, 3]) + r, prob = stats.pearsonr(a, b) + assert_approx_equal(r, np.sqrt(3)/2) + assert_approx_equal(prob, 1.0/3) + + class TestFisherExact(TestCase): """Some tests to show that fisher_exact() works correctly. @@ -382,25 +366,23 @@ class TestFisherExact(TestCase): assert_approx_equal(res[0], 4.0 / 56) def test_precise(self): - fisher_exact = stats.fisher_exact - # results from R # # R defines oddsratio differently (see Notes section of fisher_exact # docstring), so those will not match. We leave them in anyway, in # case they will be useful later on. We test only the p-value. 
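+        # For reference, the odds ratio returned alongside the p-value is the
+        # plain sample odds ratio a*d/(b*c) of the table [[a, b], [c, d]];
+        # for the first table below that is (100*5)/(2*1000) = 0.25, whereas
+        # the ~0.2506 listed here is R's conditional-MLE estimate.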
tablist = [ - ([[100, 2], [1000, 5]], (2.505583993422285e-001, 1.300759363430016e-001)), - ([[2, 7], [8, 2]], (8.586235135736206e-002, 2.301413756522114e-002)), - ([[5, 1], [10, 10]], (4.725646047336584e+000, 1.973244147157190e-001)), - ([[5, 15], [20, 20]], (3.394396617440852e-001, 9.580440012477637e-002)), - ([[5, 16], [20, 25]], (3.960558326183334e-001, 1.725864953812994e-001)), - ([[10, 5], [10, 1]], (2.116112781158483e-001, 1.973244147157190e-001)), - ([[10, 5], [10, 0]], (0.000000000000000e+000, 6.126482213438734e-002)), - ([[5, 0], [1, 4]], (np.inf, 4.761904761904762e-002)), - ([[0, 5], [1, 4]], (0.000000000000000e+000, 1.000000000000000e+000)), - ([[5, 1], [0, 4]], (np.inf, 4.761904761904758e-002)), - ([[0, 1], [3, 2]], (0.000000000000000e+000, 1.000000000000000e+000)) + ([[100, 2], [1000, 5]], (2.505583993422285e-001, 1.300759363430016e-001)), + ([[2, 7], [8, 2]], (8.586235135736206e-002, 2.301413756522114e-002)), + ([[5, 1], [10, 10]], (4.725646047336584e+000, 1.973244147157190e-001)), + ([[5, 15], [20, 20]], (3.394396617440852e-001, 9.580440012477637e-002)), + ([[5, 16], [20, 25]], (3.960558326183334e-001, 1.725864953812994e-001)), + ([[10, 5], [10, 1]], (2.116112781158483e-001, 1.973244147157190e-001)), + ([[10, 5], [10, 0]], (0.000000000000000e+000, 6.126482213438734e-002)), + ([[5, 0], [1, 4]], (np.inf, 4.761904761904762e-002)), + ([[0, 5], [1, 4]], (0.000000000000000e+000, 1.000000000000000e+000)), + ([[5, 1], [0, 4]], (np.inf, 4.761904761904758e-002)), + ([[0, 1], [3, 2]], (0.000000000000000e+000, 1.000000000000000e+000)) ] for table, res_r in tablist: res = stats.fisher_exact(np.asarray(table)) @@ -581,6 +563,7 @@ class TestCorrSpearmanr(TestCase): r = y[0] assert_approx_equal(r,1.0) + class TestCorrSpearmanrTies(TestCase): """Some tests of tie-handling by the spearmanr function.""" @@ -608,8 +591,6 @@ class TestCorrSpearmanrTies(TestCase): ### I need to figure out how to do this one. -##@dec.knownfailureif(sys.version[:3] < '2.5', "Can't index array with np.int64") -@dec.slow def test_kendalltau(): # with some ties x1 = [12, 2, 1, 12, 2] @@ -646,37 +627,21 @@ def test_kendalltau(): class TestRegression(TestCase): def test_linregressBIGX(self): - """ W.II.F. Regress BIG on X. - - The constant should be 99999990 and the regression coefficient should be 1. - """ + # W.II.F. Regress BIG on X. + # The constant should be 99999990 and the regression coefficient should be 1. y = stats.linregress(X,BIG) intercept = y[1] - r=y[2] + r = y[2] assert_almost_equal(intercept,99999990) assert_almost_equal(r,1.0) -## W.IV.A. Take the NASTY dataset above. Use the variable X as a -## basis for computing polynomials. Namely, compute X1=X, X2=X*X, -## X3=X*X*X, and so on up to 9 products. Use the algebraic -## transformation language within the statistical package itself. You -## will end up with 9 variables. Now regress X1 on X2-X9 (a perfect -## fit). If the package balks (singular or roundoff error messages), -## try X1 on X2-X8, and so on. Most packages cannot handle more than -## a few polynomials. -## Scipy's stats.py does not seem to handle multiple linear regression -## The datasets X1 . . X9 are at the top of the file. - - def test_regressXX(self): - """ W.IV.B. Regress X on X. - - The constant should be exactly 0 and the regression coefficient should be 1. - This is a perfectly valid regression. The program should not complain. - """ + # W.IV.B. Regress X on X. + # The constant should be exactly 0 and the regression coefficient should be 1. + # This is a perfectly valid regression. 
The program should not complain. y = stats.linregress(X,X) intercept = y[1] - r=y[2] + r = y[2] assert_almost_equal(intercept,0.0) assert_almost_equal(r,1.0) ## W.IV.C. Regress X on BIG and LITTLE (two predictors). The program @@ -687,58 +652,51 @@ class TestRegression(TestCase): ### Need to figure out how to handle multiple linear regression. Not obvious def test_regressZEROX(self): - """ W.IV.D. Regress ZERO on X. - - The program should inform you that ZERO has no variance or it should - go ahead and compute the regression and report a correlation and - total sum of squares of exactly 0. - """ + # W.IV.D. Regress ZERO on X. + # The program should inform you that ZERO has no variance or it should + # go ahead and compute the regression and report a correlation and + # total sum of squares of exactly 0. y = stats.linregress(X,ZERO) intercept = y[1] - r=y[2] + r = y[2] assert_almost_equal(intercept,0.0) assert_almost_equal(r,0.0) def test_regress_simple(self): - """Regress a line with sinusoidal noise.""" + # Regress a line with sinusoidal noise. x = np.linspace(0, 100, 100) y = 0.2 * np.linspace(0, 100, 100) + 10 y += np.sin(np.linspace(0, 20, 100)) res = stats.linregress(x, y) - assert_almost_equal(res[4], 2.3957814497838803e-3) #4.3609875083149268e-3) + assert_almost_equal(res[4], 2.3957814497838803e-3) def test_regress_simple_onearg_rows(self): - """Regress a line with sinusoidal noise, with a single input of shape - (2, N). - """ + # Regress a line w sinusoidal noise, with a single input of shape (2, N). x = np.linspace(0, 100, 100) y = 0.2 * np.linspace(0, 100, 100) + 10 y += np.sin(np.linspace(0, 20, 100)) rows = np.vstack((x, y)) res = stats.linregress(rows) - assert_almost_equal(res[4], 2.3957814497838803e-3) #4.3609875083149268e-3) + assert_almost_equal(res[4], 2.3957814497838803e-3) def test_regress_simple_onearg_cols(self): - """Regress a line with sinusoidal noise, with a single input of shape - (N, 2). - """ x = np.linspace(0, 100, 100) y = 0.2 * np.linspace(0, 100, 100) + 10 y += np.sin(np.linspace(0, 20, 100)) cols = np.hstack((np.expand_dims(x, 1), np.expand_dims(y, 1))) res = stats.linregress(cols) - assert_almost_equal(res[4], 2.3957814497838803e-3) #4.3609875083149268e-3) + assert_almost_equal(res[4], 2.3957814497838803e-3) def test_regress_shape_error(self): - """Check that a single input argument to linregress with wrong shape - results in a ValueError.""" + # Check that a single input argument to linregress with wrong shape + # results in a ValueError. 
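+        # (When called with a single argument, linregress expects a (2, N) or
+        # (N, 2) array of paired measurements; a (3, 3) array has no axis of
+        # length 2, hence the ValueError.)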
assert_raises(ValueError, stats.linregress, np.ones((3, 3))) def test_linregress(self): - '''compared with multivariate ols with pinv''' + # compared with multivariate ols with pinv x = np.arange(11) y = np.arange(5,16) y[[(1),(-2)]] -= 1 @@ -761,8 +719,8 @@ class TestRegression(TestCase): class TestHistogram(TestCase): - """ Tests that histogram works as it should, and keeps old behaviour - """ + # Tests that histogram works as it should, and keeps old behaviour + # # what is untested: # - multidimensional arrays (since 'a' is ravel'd as the first line in the method) # - very large arrays @@ -770,29 +728,29 @@ class TestHistogram(TestCase): # sample arrays to test the histogram with low_values = np.array([0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.7, 0.8, 0.9, 1.1, 1.2], - dtype=float) # 11 values + dtype=float) # 11 values high_range = np.array([2, 3, 4, 2, 21, 32, 78, 95, 65, 66, 66, 66, 66, 4], - dtype=float) # 14 values + dtype=float) # 14 values low_range = np.array([2, 3, 3, 2, 3, 2.4, 2.1, 3.1, 2.9, 2.6, 2.7, 2.8, 2.2, 2.001], - dtype=float) # 14 values - few_values = np.array([2.0, 3.0, -1.0, 0.0], dtype=float) # 4 values + dtype=float) # 14 values + few_values = np.array([2.0, 3.0, -1.0, 0.0], dtype=float) # 4 values def test_simple(self): - """ Tests that each of the tests works as expected with default params - """ + # Tests that each of the tests works as expected with default params + # # basic tests, with expected results (no weighting) # results taken from the previous (slower) version of histogram - basic_tests = ((self.low_values, (np.array([ 1., 1., 1., 2., 2., - 1., 1., 0., 1., 1.]), + basic_tests = ((self.low_values, (np.array([1., 1., 1., 2., 2., + 1., 1., 0., 1., 1.]), 0.14444444444444446, 0.11111111111111112, 0)), - (self.high_range, (np.array([ 5., 0., 1., 1., 0., - 0., 5., 1., 0., 1.]), + (self.high_range, (np.array([5., 0., 1., 1., 0., + 0., 5., 1., 0., 1.]), -3.1666666666666661, 10.333333333333332, 0)), - (self.low_range, (np.array([ 3., 1., 1., 1., 0., 1., - 1., 2., 3., 1.]), + (self.low_range, (np.array([3., 1., 1., 1., 0., 1., + 1., 2., 3., 1.]), 1.9388888888888889, 0.12222222222222223, 0)), - (self.few_values, (np.array([ 1., 0., 1., 0., 0., 0., - 0., 1., 0., 1.]), + (self.few_values, (np.array([1., 0., 1., 0., 0., 0., + 0., 1., 0., 1.]), -1.2222222222222223, 0.44444444444444448, 0)), ) for inputs, expected_results in basic_tests: @@ -804,23 +762,23 @@ class TestHistogram(TestCase): decimal=2) def test_weighting(self): - """ Tests that weights give expected histograms - """ + # Tests that weights give expected histograms + # basic tests, with expected results, given a set of weights # weights used (first n are used for each test, where n is len of array) (14 values) weights = np.array([1., 3., 4.5, 0.1, -1.0, 0.0, 0.3, 7.0, 103.2, 2, 40, 0, 0, 1]) # results taken from the numpy version of histogram - basic_tests = ((self.low_values, (np.array([ 4.0, 0.0, 4.5, -0.9, 0.0, - 0.3,110.2, 0.0, 0.0, 42.0]), + basic_tests = ((self.low_values, (np.array([4.0, 0.0, 4.5, -0.9, 0.0, + 0.3,110.2, 0.0, 0.0, 42.0]), 0.2, 0.1, 0)), - (self.high_range, (np.array([ 9.6, 0. , -1. , 0. , 0. , - 0. ,145.2, 0. , 0.3, 7. ]), + (self.high_range, (np.array([9.6, 0., -1., 0., 0., + 0.,145.2, 0., 0.3, 7.]), 2.0, 9.3, 0)), - (self.low_range, (np.array([ 2.4, 0. , 0. , 0. , 0. , - 2. , 40. , 0. , 103.2, 13.5]), + (self.low_range, (np.array([2.4, 0., 0., 0., 0., + 2., 40., 0., 103.2, 13.5]), 2.0, 0.11, 0)), - (self.few_values, (np.array([ 4.5, 0. , 0.1, 0. , 0. , 0. , - 0. , 1. , 0. , 3. 
]), + (self.few_values, (np.array([4.5, 0., 0.1, 0., 0., 0., + 0., 1., 0., 3.]), -1., 0.4, 0)), ) @@ -837,18 +795,18 @@ class TestHistogram(TestCase): decimal=2) def test_reduced_bins(self): - """ Tests that reducing the number of bins produces expected results - """ + # Tests that reducing the number of bins produces expected results + # basic tests, with expected results (no weighting), # except number of bins is halved to 5 # results taken from the previous (slower) version of histogram - basic_tests = ((self.low_values, (np.array([ 2., 3., 3., 1., 2.]), + basic_tests = ((self.low_values, (np.array([2., 3., 3., 1., 2.]), 0.075000000000000011, 0.25, 0)), - (self.high_range, (np.array([ 5., 2., 0., 6., 1.]), + (self.high_range, (np.array([5., 2., 0., 6., 1.]), -9.625, 23.25, 0)), - (self.low_range, (np.array([ 4., 2., 1., 3., 4.]), + (self.low_range, (np.array([4., 2., 1., 3., 4.]), 1.8625, 0.27500000000000002, 0)), - (self.few_values, (np.array([ 1., 1., 0., 1., 1.]), + (self.few_values, (np.array([1., 1., 0., 1., 1.]), -1.5, 1.0, 0)), ) for inputs, expected_results in basic_tests: @@ -860,30 +818,30 @@ class TestHistogram(TestCase): decimal=2) def test_increased_bins(self): - """ Tests that increasing the number of bins produces expected results - """ + # Tests that increasing the number of bins produces expected results + # basic tests, with expected results (no weighting), # except number of bins is double to 20 # results taken from the previous (slower) version of histogram - basic_tests = ((self.low_values, (np.array([ 1., 0., 1., 0., 1., - 0., 2., 0., 1., 0., - 1., 1., 0., 1., 0., - 0., 0., 1., 0., 1.]), + basic_tests = ((self.low_values, (np.array([1., 0., 1., 0., 1., + 0., 2., 0., 1., 0., + 1., 1., 0., 1., 0., + 0., 0., 1., 0., 1.]), 0.1736842105263158, 0.052631578947368418, 0)), - (self.high_range, (np.array([ 5., 0., 0., 0., 1., - 0., 1., 0., 0., 0., - 0., 0., 0., 5., 0., - 0., 1., 0., 0., 1.]), + (self.high_range, (np.array([5., 0., 0., 0., 1., + 0., 1., 0., 0., 0., + 0., 0., 0., 5., 0., + 0., 1., 0., 0., 1.]), -0.44736842105263142, 4.8947368421052628, 0)), - (self.low_range, (np.array([ 3., 0., 1., 1., 0., 0., - 0., 1., 0., 0., 1., 0., - 1., 0., 1., 0., 1., 3., - 0., 1.]), + (self.low_range, (np.array([3., 0., 1., 1., 0., 0., + 0., 1., 0., 0., 1., 0., + 1., 0., 1., 0., 1., 3., + 0., 1.]), 1.9710526315789474, 0.057894736842105263, 0)), - (self.few_values, (np.array([ 1., 0., 0., 0., 0., 1., - 0., 0., 0., 0., 0., 0., - 0., 0., 1., 0., 0., 0., - 0., 1.]), + (self.few_values, (np.array([1., 0., 0., 0., 0., 1., + 0., 0., 0., 0., 0., 0., + 0., 0., 1., 0., 0., 0., + 0., 1.]), -1.1052631578947367, 0.21052631578947367, 0)), ) for inputs, expected_results in basic_tests: @@ -898,37 +856,29 @@ class TestHistogram(TestCase): def test_cumfreq(): x = [1, 4, 2, 1, 3, 1] cumfreqs, lowlim, binsize, extrapoints = stats.cumfreq(x, numbins=4) - assert_array_almost_equal(cumfreqs, np.array([ 3., 4., 5., 6.])) + assert_array_almost_equal(cumfreqs, np.array([3., 4., 5., 6.])) cumfreqs, lowlim, binsize, extrapoints = stats.cumfreq(x, numbins=4, defaultreallimits=(1.5, 5)) - assert_(extrapoints==3) + assert_(extrapoints == 3) def test_relfreq(): a = np.array([1, 4, 2, 1, 3, 1]) relfreqs, lowlim, binsize, extrapoints = stats.relfreq(a, numbins=4) - assert_array_almost_equal(relfreqs, array([0.5, 0.16666667, 0.16666667, 0.16666667])) + assert_array_almost_equal(relfreqs, + array([0.5, 0.16666667, 0.16666667, 0.16666667])) # check array_like input is accepted - relfreqs2, lowlim, binsize, extrapoints = 
stats.relfreq([1, 4, 2, 1, 3, 1], numbins=4) + relfreqs2, lowlim, binsize, extrapoints = stats.relfreq([1, 4, 2, 1, 3, 1], + numbins=4) assert_array_almost_equal(relfreqs, relfreqs2) -# Utility - -def compare_results(res,desired): - for i in range(len(desired)): - assert_array_equal(res[i],desired[i]) - - -################################################## -### Test for sum - class TestGMean(TestCase): def test_1D_list(self): a = (1,2,3,4) - actual= stats.gmean(a) + actual = stats.gmean(a) desired = power(1*2*3*4,1./4.) assert_almost_equal(actual, desired,decimal=14) @@ -937,7 +887,7 @@ class TestGMean(TestCase): def test_1D_array(self): a = array((1,2,3,4), float32) - actual= stats.gmean(a) + actual = stats.gmean(a) desired = power(1*2*3*4,1./4.) assert_almost_equal(actual, desired, decimal=7) @@ -948,7 +898,7 @@ class TestGMean(TestCase): a = array(((1,2,3,4), (1,2,3,4), (1,2,3,4))) - actual= stats.gmean(a) + actual = stats.gmean(a) desired = array((1,2,3,4)) assert_array_almost_equal(actual, desired, decimal=14) @@ -959,7 +909,7 @@ class TestGMean(TestCase): a = array(((1,2,3,4), (1,2,3,4), (1,2,3,4))) - actual= stats.gmean(a, axis=1) + actual = stats.gmean(a, axis=1) v = power(1*2*3*4,1./4.) desired = array((v,v,v)) assert_array_almost_equal(actual, desired, decimal=14) @@ -969,19 +919,21 @@ class TestGMean(TestCase): actual = stats.gmean(a) assert_approx_equal(actual, 1e200, significant=14) + class TestHMean(TestCase): def test_1D_list(self): a = (1,2,3,4) - actual= stats.hmean(a) - desired = 4. / (1./1 + 1./2 + 1./3 + 1./4) + actual = stats.hmean(a) + desired = 4. / (1./1 + 1./2 + 1./3 + 1./4) assert_almost_equal(actual, desired, decimal=14) desired1 = stats.hmean(array(a),axis=-1) assert_almost_equal(actual, desired1, decimal=14) + def test_1D_array(self): a = array((1,2,3,4), float64) - actual= stats.hmean(a) - desired = 4. / (1./1 + 1./2 + 1./3 + 1./4) + actual = stats.hmean(a) + desired = 4. 
/ (1./1 + 1./2 + 1./3 + 1./4) assert_almost_equal(actual, desired, decimal=14) desired1 = stats.hmean(a,axis=-1) @@ -1008,7 +960,7 @@ class TestHMean(TestCase): actual1 = stats.hmean(a, axis=1) assert_array_almost_equal(actual1, desired1, decimal=14) -@dec.slow + class TestScoreatpercentile(TestCase): def setUp(self): self.a1 = [3, 4, 5, 10, -3, -5, 6] @@ -1036,8 +988,8 @@ class TestScoreatpercentile(TestCase): assert_equal(scoreatperc(list(range(10)), 50), 4.5) assert_equal(scoreatperc(list(range(10)), 50, (2,7)), 4.5) assert_equal(scoreatperc(list(range(100)), 50, limit=(1, 8)), 4.5) - assert_equal(scoreatperc(np.array([1, 10 ,100]), 50, (10,100)), 55) - assert_equal(scoreatperc(np.array([1, 10 ,100]), 50, (1,10)), 5.5) + assert_equal(scoreatperc(np.array([1, 10,100]), 50, (10,100)), 55) + assert_equal(scoreatperc(np.array([1, 10,100]), 50, (1,10)), 5.5) # explicitly specify interpolation_method 'fraction' (the default) assert_equal(scoreatperc(list(range(10)), 50, interpolation_method='fraction'), @@ -1048,10 +1000,10 @@ class TestScoreatpercentile(TestCase): assert_equal(scoreatperc(list(range(100)), 50, limit=(1, 8), interpolation_method='fraction'), 4.5) - assert_equal(scoreatperc(np.array([1, 10 ,100]), 50, (10, 100), + assert_equal(scoreatperc(np.array([1, 10,100]), 50, (10, 100), interpolation_method='fraction'), 55) - assert_equal(scoreatperc(np.array([1, 10 ,100]), 50, (1,10), + assert_equal(scoreatperc(np.array([1, 10,100]), 50, (1,10), interpolation_method='fraction'), 5.5) @@ -1103,12 +1055,41 @@ class TestScoreatpercentile(TestCase): assert_raises(ValueError, stats.scoreatpercentile, [1], -1) -class TestCMedian(TestCase): - def test_basic(self): - data = [1,2,3,1,5,3,6,4,3,2,4,3,5,2.0] - assert_almost_equal(stats.cmedian(data,5),3.2916666666666665) - assert_almost_equal(stats.cmedian(data,3),3.083333333333333) - assert_almost_equal(stats.cmedian(data),3.0020020020020022) +class TestItemfreq(object): + a = [5, 7, 1, 2, 1, 5, 7] * 10 + b = [1, 2, 5, 7] + + def test_numeric_types(self): + # Check itemfreq works for all dtypes (adapted from np.unique tests) + def _check_itemfreq(dt): + a = np.array(self.a, dt) + v = stats.itemfreq(a) + assert_array_equal(v[:, 0], [1, 2, 5, 7]) + assert_array_equal(v[:, 1], np.array([20, 10, 20, 20], dtype=dt)) + + dtypes = [np.int32, np.int64, np.float32, np.float64, + np.complex64, np.complex128] + for dt in dtypes: + yield _check_itemfreq, dt + + def test_object_arrays(self): + a, b = self.a, self.b + dt = 'O' + aa = np.empty(len(a), dt) + aa[:] = a + bb = np.empty(len(b), dt) + bb[:] = b + v = stats.itemfreq(aa) + assert_array_equal(v[:, 0], bb) + + def test_structured_arrays(self): + a, b = self.a, self.b + dt = [('', 'i'), ('', 'i')] + aa = np.array(list(zip(a, a)), dt) + bb = np.array(list(zip(b, b)), dt) + v = stats.itemfreq(aa) + # Arrays don't compare equal because v[:,0] is object array + assert_equal(v[2, 0], bb[2]) class TestMode(TestCase): @@ -1124,35 +1105,35 @@ class TestVariability(TestCase): testcase = [1,2,3,4] def test_signaltonoise(self): - """ - this is not in R, so used - mean(testcase,axis=0)/(sqrt(var(testcase)*3/4)) """ - #y = stats.signaltonoise(self.shoes[0]) - #assert_approx_equal(y,4.5709967) + # This is not in R, so used: + # mean(testcase, axis=0) / (sqrt(var(testcase) * 3/4)) + + # y = stats.signaltonoise(self.shoes[0]) + # assert_approx_equal(y,4.5709967) y = stats.signaltonoise(self.testcase) assert_approx_equal(y,2.236067977) def test_sem(self): - """ - this is not in R, so used - 
sqrt(var(testcase)*3/4)/sqrt(3) - """ - #y = stats.sem(self.shoes[0]) - #assert_approx_equal(y,0.775177399) + # This is not in R, so used: + # sqrt(var(testcase)*3/4)/sqrt(3) + + # y = stats.sem(self.shoes[0]) + # assert_approx_equal(y,0.775177399) y = stats.sem(self.testcase) assert_approx_equal(y,0.6454972244) + n = len(self.testcase) + assert_allclose(stats.sem(self.testcase, ddof=0) * np.sqrt(n/(n-2)), + stats.sem(self.testcase, ddof=2)) def test_zmap(self): - """ - not in R, so tested by using - (testcase[i]-mean(testcase,axis=0))/sqrt(var(testcase)*3/4) - """ + # not in R, so tested by using: + # (testcase[i] - mean(testcase, axis=0)) / sqrt(var(testcase) * 3/4) y = stats.zmap(self.testcase,self.testcase) - desired = ([-1.3416407864999, -0.44721359549996 , 0.44721359549996 , 1.3416407864999]) + desired = ([-1.3416407864999, -0.44721359549996, 0.44721359549996, 1.3416407864999]) assert_array_almost_equal(desired,y,decimal=12) def test_zmap_axis(self): - """Test use of 'axis' keyword in zmap.""" + # Test use of 'axis' keyword in zmap. x = np.array([[0.0, 0.0, 1.0, 1.0], [1.0, 1.0, 1.0, 2.0], [2.0, 0.0, 2.0, 0.0]]) @@ -1165,8 +1146,8 @@ class TestVariability(TestCase): z1 = stats.zmap(x, x, axis=1) z0_expected = [[-t1, -t3/2, -t3/2, 0.0], - [0.0, t3, -t3/2, t1], - [t1, -t3/2, t3, -t1]] + [0.0, t3, -t3/2, t1], + [t1, -t3/2, t3, -t1]] z1_expected = [[-1.0, -1.0, 1.0, 1.0], [-t2, -t2, -t2, np.sqrt(3.)], [1.0, -1.0, 1.0, -1.0]] @@ -1175,14 +1156,10 @@ class TestVariability(TestCase): assert_array_almost_equal(z1, z1_expected) def test_zmap_ddof(self): - """Test use of 'ddof' keyword in zmap.""" + # Test use of 'ddof' keyword in zmap. x = np.array([[0.0, 0.0, 1.0, 1.0], [0.0, 1.0, 2.0, 3.0]]) - t1 = 1.0/np.sqrt(2.0/3) - t2 = np.sqrt(3.)/3 - t3 = np.sqrt(2.) - z = stats.zmap(x, x, axis=1, ddof=1) z0_expected = np.array([-0.5, -0.5, 0.5, 0.5])/(1.0/np.sqrt(3)) @@ -1191,16 +1168,14 @@ class TestVariability(TestCase): assert_array_almost_equal(z[1], z1_expected) def test_zscore(self): - """ - not in R, so tested by using - (testcase[i]-mean(testcase,axis=0))/sqrt(var(testcase)*3/4) - """ + # not in R, so tested by using: + # (testcase[i] - mean(testcase, axis=0)) / sqrt(var(testcase) * 3/4) y = stats.zscore(self.testcase) - desired = ([-1.3416407864999, -0.44721359549996 , 0.44721359549996 , 1.3416407864999]) + desired = ([-1.3416407864999, -0.44721359549996, 0.44721359549996, 1.3416407864999]) assert_array_almost_equal(desired,y,decimal=12) def test_zscore_axis(self): - """Test use of 'axis' keyword in zscore.""" + # Test use of 'axis' keyword in zscore. x = np.array([[0.0, 0.0, 1.0, 1.0], [1.0, 1.0, 1.0, 2.0], [2.0, 0.0, 2.0, 0.0]]) @@ -1213,8 +1188,8 @@ class TestVariability(TestCase): z1 = stats.zscore(x, axis=1) z0_expected = [[-t1, -t3/2, -t3/2, 0.0], - [0.0, t3, -t3/2, t1], - [t1, -t3/2, t3, -t1]] + [0.0, t3, -t3/2, t1], + [t1, -t3/2, t3, -t1]] z1_expected = [[-1.0, -1.0, 1.0, 1.0], [-t2, -t2, -t2, np.sqrt(3.)], [1.0, -1.0, 1.0, -1.0]] @@ -1223,14 +1198,10 @@ class TestVariability(TestCase): assert_array_almost_equal(z1, z1_expected) def test_zscore_ddof(self): - """Test use of 'ddof' keyword in zscore.""" + # Test use of 'ddof' keyword in zscore. x = np.array([[0.0, 0.0, 1.0, 1.0], [0.0, 1.0, 2.0, 3.0]]) - t1 = 1.0/np.sqrt(2.0/3) - t2 = np.sqrt(3.)/3 - t3 = np.sqrt(2.) - z = stats.zscore(x, axis=1, ddof=1) z0_expected = np.array([-0.5, -0.5, 0.5, 0.5])/(1.0/np.sqrt(3)) @@ -1250,10 +1221,10 @@ class TestMoments(TestCase): Note that both test cases came from here. 
""" testcase = [1,2,3,4] - testmathworks = [1.165 , 0.6268, 0.0751, 0.3516, -0.6965] + testmathworks = [1.165, 0.6268, 0.0751, 0.3516, -0.6965] + def test_moment(self): - """ - mean((testcase-mean(testcase))**power,axis=0),axis=0))**power))""" + # mean((testcase-mean(testcase))**power,axis=0),axis=0))**power)) y = stats.moment(self.testcase,1) assert_approx_equal(y,0.0,10) y = stats.moment(self.testcase,2) @@ -1262,19 +1233,15 @@ class TestMoments(TestCase): assert_approx_equal(y,0.0) y = stats.moment(self.testcase,4) assert_approx_equal(y,2.5625) + def test_variation(self): - """ - variation = samplestd/mean """ -## y = stats.variation(self.shoes[0]) -## assert_approx_equal(y,21.8770668) + # variation = samplestd / mean y = stats.variation(self.testcase) assert_approx_equal(y,0.44721359549996, 10) def test_skewness(self): - """ - sum((testmathworks-mean(testmathworks,axis=0))**3,axis=0)/ - ((sqrt(var(testmathworks)*4/5))**3)/5 - """ + # sum((testmathworks-mean(testmathworks,axis=0))**3,axis=0) / + # ((sqrt(var(testmathworks)*4/5))**3)/5 y = stats.skew(self.testmathworks) assert_approx_equal(y,-0.29322304336607,10) y = stats.skew(self.testmathworks,bias=0) @@ -1283,18 +1250,14 @@ class TestMoments(TestCase): assert_approx_equal(y,0.0,10) def test_skewness_scalar(self): - """ - `skew` must return a scalar for 1-dim input - """ + # `skew` must return a scalar for 1-dim input assert_equal(stats.skew(arange(10)), 0.0) def test_kurtosis(self): - """ - sum((testcase-mean(testcase,axis=0))**4,axis=0)/((sqrt(var(testcase)*3/4))**4)/4 - sum((test2-mean(testmathworks,axis=0))**4,axis=0)/((sqrt(var(testmathworks)*4/5))**4)/5 - Set flags for axis = 0 and - fisher=0 (Pearson's defn of kurtosis for compatiability with Matlab) - """ + # sum((testcase-mean(testcase,axis=0))**4,axis=0)/((sqrt(var(testcase)*3/4))**4)/4 + # sum((test2-mean(testmathworks,axis=0))**4,axis=0)/((sqrt(var(testmathworks)*4/5))**4)/5 + # Set flags for axis = 0 and + # fisher=0 (Pearson's defn of kurtosis for compatiability with Matlab) y = stats.kurtosis(self.testmathworks,0,fisher=0,bias=1) assert_approx_equal(y, 2.1658856802973,10) @@ -1310,6 +1273,7 @@ class TestMoments(TestCase): def test_kurtosis_array_scalar(self): assert_equal(type(stats.kurtosis([1,2,3])), float) + class TestThreshold(TestCase): def test_basic(self): a = [-1,2,3,4,5,-1,-2] @@ -1321,7 +1285,7 @@ class TestThreshold(TestCase): assert_array_equal(stats.threshold(a,2,4,0), [0,2,3,4,0,0,0]) -# Hypothesis test tests + class TestStudentTest(TestCase): X1 = np.array([-1, 0, 1]) X2 = np.array([0, 1, 2]) @@ -1330,9 +1294,10 @@ class TestStudentTest(TestCase): T1_1 = -1.732051 P1_1 = 0.2254033 T1_2 = -3.464102 - P1_2 = 0.0741799 + P1_2 = 0.0741799 T2_0 = 1.732051 P2_0 = 0.2254033 + def test_onesample(self): t, p = stats.ttest_1samp(self.X1, 0) @@ -1393,7 +1358,7 @@ def test_percentileofscore(): 4, kind=kind), \ 30 - #larger numbers + # larger numbers for (kind, result) in [('mean', 35.0), ('strict', 30.0), ('weak', 40.0)]: @@ -1408,13 +1373,12 @@ def test_percentileofscore(): pcos([10, 20, 30, 40, 40, 40, 50, 60, 70, 80], 40, kind=kind), result - for kind in ('rank', 'mean', 'strict', 'weak'): yield assert_equal, \ pcos([10, 20, 30, 50, 60, 70, 80, 90, 100, 110], 40, kind=kind), 30.0 - #boundaries + # boundaries for (kind, result) in [('rank', 10.0), ('mean', 5.0), ('strict', 0.0), @@ -1431,7 +1395,7 @@ def test_percentileofscore(): pcos([10, 20, 30, 50, 60, 70, 80, 90, 100, 110], 110, kind=kind), result - #out of bounds + # out of bounds for (kind, score, result) in 
[('rank', 200, 100.0), ('mean', 200, 100.0), ('mean', 0, 0.0)]: @@ -1439,12 +1403,343 @@ def test_percentileofscore(): pcos([10, 20, 30, 50, 60, 70, 80, 90, 100, 110], score, kind=kind), result + assert_raises(ValueError, pcos, [1, 2, 3, 3, 4], 3, kind='unrecognized') + + +PowerDivCase = namedtuple('Case', ['f_obs', 'f_exp', 'ddof', 'axis', + 'chi2', # Pearson's + 'log', # G-test (log-likelihood) + 'mod_log', # Modified log-likelihood + 'cr', # Cressie-Read (lambda=2/3) + ]) + +# The details of the first two elements in power_div_1d_cases are used +# in a test in TestPowerDivergence. Check that code before making +# any changes here. +power_div_1d_cases = [ + # Use the default f_exp. + PowerDivCase(f_obs=[4, 8, 12, 8], f_exp=None, ddof=0, axis=None, + chi2=4, + log=2*(4*np.log(4/8) + 12*np.log(12/8)), + mod_log=2*(8*np.log(8/4) + 8*np.log(8/12)), + cr=(4*((4/8)**(2/3) - 1) + 12*((12/8)**(2/3) - 1))/(5/9)), + # Give a non-uniform f_exp. + PowerDivCase(f_obs=[4, 8, 12, 8], f_exp=[2, 16, 12, 2], ddof=0, axis=None, + chi2=24, + log=2*(4*np.log(4/2) + 8*np.log(8/16) + 8*np.log(8/2)), + mod_log=2*(2*np.log(2/4) + 16*np.log(16/8) + 2*np.log(2/8)), + cr=(4*((4/2)**(2/3) - 1) + 8*((8/16)**(2/3) - 1) + + 8*((8/2)**(2/3) - 1))/(5/9)), + # f_exp is a scalar. + PowerDivCase(f_obs=[4, 8, 12, 8], f_exp=8, ddof=0, axis=None, + chi2=4, + log=2*(4*np.log(4/8) + 12*np.log(12/8)), + mod_log=2*(8*np.log(8/4) + 8*np.log(8/12)), + cr=(4*((4/8)**(2/3) - 1) + 12*((12/8)**(2/3) - 1))/(5/9)), + # f_exp equal to f_obs. + PowerDivCase(f_obs=[3, 5, 7, 9], f_exp=[3, 5, 7, 9], ddof=0, axis=0, + chi2=0, log=0, mod_log=0, cr=0), +] + + +power_div_empty_cases = [ + # Shape is (0,)--a data set with length 0. The computed + # test statistic should be 0. + PowerDivCase(f_obs=[], + f_exp=None, ddof=0, axis=0, + chi2=0, log=0, mod_log=0, cr=0), + # Shape is (0, 3). This is 3 data sets, but each data set has + # length 0, so the computed test statistic should be [0, 0, 0]. + PowerDivCase(f_obs=np.array([[],[],[]]).T, + f_exp=None, ddof=0, axis=0, + chi2=[0, 0, 0], + log=[0, 0, 0], + mod_log=[0, 0, 0], + cr=[0, 0, 0]), + # Shape is (3, 0). This represents an empty collection of + # data sets in which each data set has length 3. The test + # statistic should be an empty array. + PowerDivCase(f_obs=np.array([[],[],[]]), + f_exp=None, ddof=0, axis=0, + chi2=[], + log=[], + mod_log=[], + cr=[]), +] + + +class TestPowerDivergence(object): + + def check_power_divergence(self, f_obs, f_exp, ddof, axis, lambda_, + expected_stat): + f_obs = np.asarray(f_obs) + if axis is None: + num_obs = f_obs.size + else: + b = np.broadcast(f_obs, f_exp) + num_obs = b.shape[axis] + stat, p = stats.power_divergence(f_obs=f_obs, f_exp=f_exp, ddof=ddof, + axis=axis, lambda_=lambda_) + assert_allclose(stat, expected_stat) + + if lambda_ == 1 or lambda_ == "pearson": + # Also test stats.chisquare. 
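+            # For reference, the statistic being checked is
+            #     2/(lambda*(lambda+1)) * sum(f_obs*((f_obs/f_exp)**lambda - 1)),
+            # and lambda=1 reduces to Pearson's sum((f_obs - f_exp)**2 / f_exp)
+            # (for matching totals), so chisquare must agree with lambda_=1.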
+ stat, p = stats.chisquare(f_obs=f_obs, f_exp=f_exp, ddof=ddof, + axis=axis) + assert_allclose(stat, expected_stat) + + ddof = np.asarray(ddof) + expected_p = stats.chisqprob(expected_stat, num_obs - 1 - ddof) + assert_allclose(p, expected_p) + + def test_basic(self): + for case in power_div_1d_cases: + yield (self.check_power_divergence, + case.f_obs, case.f_exp, case.ddof, case.axis, + None, case.chi2) + yield (self.check_power_divergence, + case.f_obs, case.f_exp, case.ddof, case.axis, + "pearson", case.chi2) + yield (self.check_power_divergence, + case.f_obs, case.f_exp, case.ddof, case.axis, + 1, case.chi2) + yield (self.check_power_divergence, + case.f_obs, case.f_exp, case.ddof, case.axis, + "log-likelihood", case.log) + yield (self.check_power_divergence, + case.f_obs, case.f_exp, case.ddof, case.axis, + "mod-log-likelihood", case.mod_log) + yield (self.check_power_divergence, + case.f_obs, case.f_exp, case.ddof, case.axis, + "cressie-read", case.cr) + yield (self.check_power_divergence, + case.f_obs, case.f_exp, case.ddof, case.axis, + 2/3, case.cr) + + def test_basic_masked(self): + for case in power_div_1d_cases: + mobs = np.ma.array(case.f_obs) + yield (self.check_power_divergence, + mobs, case.f_exp, case.ddof, case.axis, + None, case.chi2) + yield (self.check_power_divergence, + mobs, case.f_exp, case.ddof, case.axis, + "pearson", case.chi2) + yield (self.check_power_divergence, + mobs, case.f_exp, case.ddof, case.axis, + 1, case.chi2) + yield (self.check_power_divergence, + mobs, case.f_exp, case.ddof, case.axis, + "log-likelihood", case.log) + yield (self.check_power_divergence, + mobs, case.f_exp, case.ddof, case.axis, + "mod-log-likelihood", case.mod_log) + yield (self.check_power_divergence, + mobs, case.f_exp, case.ddof, case.axis, + "cressie-read", case.cr) + yield (self.check_power_divergence, + mobs, case.f_exp, case.ddof, case.axis, + 2/3, case.cr) + + def test_axis(self): + case0 = power_div_1d_cases[0] + case1 = power_div_1d_cases[1] + f_obs = np.vstack((case0.f_obs, case1.f_obs)) + f_exp = np.vstack((np.ones_like(case0.f_obs)*np.mean(case0.f_obs), + case1.f_exp)) + # Check the four computational code paths in power_divergence + # using a 2D array with axis=1. + yield (self.check_power_divergence, + f_obs, f_exp, 0, 1, + "pearson", [case0.chi2, case1.chi2]) + yield (self.check_power_divergence, + f_obs, f_exp, 0, 1, + "log-likelihood", [case0.log, case1.log]) + yield (self.check_power_divergence, + f_obs, f_exp, 0, 1, + "mod-log-likelihood", [case0.mod_log, case1.mod_log]) + yield (self.check_power_divergence, + f_obs, f_exp, 0, 1, + "cressie-read", [case0.cr, case1.cr]) + # Reshape case0.f_obs to shape (2,2), and use axis=None. + # The result should be the same. + yield (self.check_power_divergence, + np.array(case0.f_obs).reshape(2, 2), None, 0, None, + "pearson", case0.chi2) + + def test_ddof_broadcasting(self): + # Test that ddof broadcasts correctly. + # ddof does not affect the test statistic. It is broadcast + # with the computed test statistic for the computation of + # the p value. + + case0 = power_div_1d_cases[0] + case1 = power_div_1d_cases[1] + # Create 4x2 arrays of observed and expected frequencies. + f_obs = np.vstack((case0.f_obs, case1.f_obs)).T + f_exp = np.vstack((np.ones_like(case0.f_obs)*np.mean(case0.f_obs), + case1.f_exp)).T + + expected_chi2 = [case0.chi2, case1.chi2] + + # ddof has shape (2, 1). This is broadcast with the computed + # statistic, so p will have shape (2,2). 
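+        # (Each column holds 4 observations, so the p-values below use
+        # 4 - 1 - ddof degrees of freedom: 3 for the first row of ddof,
+        # 2 for the second.)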
+ ddof = np.array([[0], [1]]) + + stat, p = stats.power_divergence(f_obs, f_exp, ddof=ddof) + assert_allclose(stat, expected_chi2) + + # Compute the p values separately, passing in scalars for ddof. + stat0, p0 = stats.power_divergence(f_obs, f_exp, ddof=ddof[0,0]) + stat1, p1 = stats.power_divergence(f_obs, f_exp, ddof=ddof[1,0]) + + assert_array_equal(p, np.vstack((p0, p1))) + + def test_empty_cases(self): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + for case in power_div_empty_cases: + yield (self.check_power_divergence, + case.f_obs, case.f_exp, case.ddof, case.axis, + "pearson", case.chi2) + yield (self.check_power_divergence, + case.f_obs, case.f_exp, case.ddof, case.axis, + "log-likelihood", case.log) + yield (self.check_power_divergence, + case.f_obs, case.f_exp, case.ddof, case.axis, + "mod-log-likelihood", case.mod_log) + yield (self.check_power_divergence, + case.f_obs, case.f_exp, case.ddof, case.axis, + "cressie-read", case.cr) + + +def test_chisquare_masked_arrays(): + # Test masked arrays. + obs = np.array([[8, 8, 16, 32, -1], [-1, -1, 3, 4, 5]]).T + mask = np.array([[0, 0, 0, 0, 1], [1, 1, 0, 0, 0]]).T + mobs = np.ma.masked_array(obs, mask) + expected_chisq = np.array([24.0, 0.5]) + expected_g = np.array([2*(2*8*np.log(0.5) + 32*np.log(2.0)), + 2*(3*np.log(0.75) + 5*np.log(1.25))]) + + chisq, p = stats.chisquare(mobs) + mat.assert_array_equal(chisq, expected_chisq) + mat.assert_array_almost_equal(p, stats.chisqprob(expected_chisq, + mobs.count(axis=0) - 1)) + + g, p = stats.power_divergence(mobs, lambda_='log-likelihood') + mat.assert_array_almost_equal(g, expected_g, decimal=15) + mat.assert_array_almost_equal(p, stats.chisqprob(expected_g, + mobs.count(axis=0) - 1)) + + chisq, p = stats.chisquare(mobs.T, axis=1) + mat.assert_array_equal(chisq, expected_chisq) + mat.assert_array_almost_equal(p, + stats.chisqprob(expected_chisq, + mobs.T.count(axis=1) - 1)) + + g, p = stats.power_divergence(mobs.T, axis=1, lambda_="log-likelihood") + mat.assert_array_almost_equal(g, expected_g, decimal=15) + mat.assert_array_almost_equal(p, stats.chisqprob(expected_g, + mobs.count(axis=0) - 1)) + + obs1 = np.ma.array([3, 5, 6, 99, 10], mask=[0, 0, 0, 1, 0]) + exp1 = np.ma.array([2, 4, 8, 10, 99], mask=[0, 0, 0, 0, 1]) + chi2, p = stats.chisquare(obs1, f_exp=exp1) + # Because of the mask at index 3 of obs1 and at index 4 of exp1, + # only the first three elements are included in the calculation + # of the statistic. + mat.assert_array_equal(chi2, 1/2 + 1/4 + 4/8) + + # When axis=None, the two values should have type np.float64. + chisq, p = stats.chisquare(np.ma.array([1,2,3]), axis=None) + assert_(isinstance(chisq, np.float64)) + assert_(isinstance(p, np.float64)) + assert_equal(chisq, 1.0) + assert_almost_equal(p, stats.chisqprob(1.0, 2)) + + # Empty arrays: + # A data set with length 0 returns a masked scalar. + chisq, p = stats.chisquare(np.ma.array([])) + assert_(isinstance(chisq, np.ma.MaskedArray)) + assert_equal(chisq.shape, ()) + assert_(chisq.mask) + + empty3 = np.ma.array([[],[],[]]) + + # empty3 is a collection of 0 data sets (whose lengths would be 3, if + # there were any), so the return value is an array with length 0. + chisq, p = stats.chisquare(empty3) + assert_(isinstance(chisq, np.ma.MaskedArray)) + mat.assert_array_equal(chisq, []) + + # empty3.T is an array containing 3 data sets, each with length 0, + # so an array of size (3,) is returned, with all values masked. 
+ chisq, p = stats.chisquare(empty3.T) + assert_(isinstance(chisq, np.ma.MaskedArray)) + assert_equal(chisq.shape, (3,)) + assert_(np.all(chisq.mask)) + + +def test_power_divergence_against_cressie_read_data(): + # Test stats.power_divergence against tables 4 and 5 from + # Cressie and Read, "Multimonial Goodness-of-Fit Tests", + # J. R. Statist. Soc. B (1984), Vol 46, No. 3, pp. 440-464. + # This tests the calculation for several values of lambda. + + # `table4` holds just the second and third columns from Table 4. + table4 = np.array([ + # observed, expected, + 15, 15.171, + 11, 13.952, + 14, 12.831, + 17, 11.800, + 5, 10.852, + 11, 9.9796, + 10, 9.1777, + 4, 8.4402, + 8, 7.7620, + 10, 7.1383, + 7, 6.5647, + 9, 6.0371, + 11, 5.5520, + 3, 5.1059, + 6, 4.6956, + 1, 4.3183, + 1, 3.9713, + 4, 3.6522, + ]).reshape(-1, 2) + table5 = np.array([ + # lambda, statistic + -10.0, 72.2e3, + -5.0, 28.9e1, + -3.0, 65.6, + -2.0, 40.6, + -1.5, 34.0, + -1.0, 29.5, + -0.5, 26.5, + 0.0, 24.6, + 0.5, 23.4, + 0.67, 23.1, + 1.0, 22.7, + 1.5, 22.6, + 2.0, 22.9, + 3.0, 24.8, + 5.0, 35.5, + 10.0, 21.4e1, + ]).reshape(-1, 2) + + for lambda_, expected_stat in table5: + stat, p = stats.power_divergence(table4[:,0], table4[:,1], + lambda_=lambda_) + assert_allclose(stat, expected_stat, rtol=5e-3) + def test_friedmanchisquare(): # see ticket:113 # verified with matlab and R - #From Demsar "Statistical Comparisons of Classifiers over Multiple Data Sets" - #2006, Xf=9.28 (no tie handling, tie corrected Xf >=9.28) + # From Demsar "Statistical Comparisons of Classifiers over Multiple Data Sets" + # 2006, Xf=9.28 (no tie handling, tie corrected Xf >=9.28) x1 = [array([0.763, 0.599, 0.954, 0.628, 0.882, 0.936, 0.661, 0.583, 0.775, 1.0, 0.94, 0.619, 0.972, 0.957]), array([0.768, 0.591, 0.971, 0.661, 0.888, 0.931, 0.668, 0.583, @@ -1454,72 +1749,78 @@ def test_friedmanchisquare(): array([0.798, 0.569, 0.967, 0.657, 0.898, 0.931, 0.685, 0.625, 0.875, 1.0, 0.962, 0.669, 0.975, 0.970])] - #From "Bioestadistica para las ciencias de la salud" Xf=18.95 p<0.001: + # From "Bioestadistica para las ciencias de la salud" Xf=18.95 p<0.001: x2 = [array([4,3,5,3,5,3,2,5,4,4,4,3]), array([2,2,1,2,3,1,2,3,2,1,1,3]), array([2,4,3,3,4,3,3,4,4,1,2,1]), array([3,5,4,3,4,4,3,3,3,4,4,4])] - #From Jerrorl H. Zar, "Biostatistical Analysis"(example 12.6), Xf=10.68, 0.005 < p < 0.01: - #Probability from this example is inexact using Chisquare aproximation of Friedman Chisquare. + # From Jerrorl H. Zar, "Biostatistical Analysis"(example 12.6), Xf=10.68, 0.005 < p < 0.01: + # Probability from this example is inexact using Chisquare aproximation of Friedman Chisquare. 
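+    # For reference, Friedman's statistic for k treatments in n blocks is
+    # (ignoring ties)
+    #     Xf = 12*n/(k*(k+1)) * sum((Rbar_j - (k+1)/2)**2),
+    # with Rbar_j the mean within-block rank of treatment j, referred to a
+    # chi-square distribution with k-1 degrees of freedom.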
x3 = [array([7.0,9.9,8.5,5.1,10.3]), array([5.3,5.7,4.7,3.5,7.7]), array([4.9,7.6,5.5,2.8,8.4]), array([8.8,8.9,8.1,3.3,9.1])] - - assert_array_almost_equal(stats.friedmanchisquare(x1[0],x1[1],x1[2],x1[3]),(10.2283464566929, 0.0167215803284414)) - assert_array_almost_equal(stats.friedmanchisquare(x2[0],x2[1],x2[2],x2[3]),(18.9428571428571, 0.000280938375189499)) - assert_array_almost_equal(stats.friedmanchisquare(x3[0],x3[1],x3[2],x3[3]),(10.68, 0.0135882729582176)) + assert_array_almost_equal(stats.friedmanchisquare(x1[0],x1[1],x1[2],x1[3]), + (10.2283464566929, 0.0167215803284414)) + assert_array_almost_equal(stats.friedmanchisquare(x2[0],x2[1],x2[2],x2[3]), + (18.9428571428571, 0.000280938375189499)) + assert_array_almost_equal(stats.friedmanchisquare(x3[0],x3[1],x3[2],x3[3]), + (10.68, 0.0135882729582176)) np.testing.assert_raises(ValueError, stats.friedmanchisquare,x3[0],x3[1]) # test using mstats - assert_array_almost_equal(stats.mstats.friedmanchisquare(x1[0],x1[1],x1[2],x1[3]),(10.2283464566929, 0.0167215803284414)) + assert_array_almost_equal(stats.mstats.friedmanchisquare(x1[0],x1[1],x1[2],x1[3]), + (10.2283464566929, 0.0167215803284414)) # the following fails - #assert_array_almost_equal(stats.mstats.friedmanchisquare(x2[0],x2[1],x2[2],x2[3]),(18.9428571428571, 0.000280938375189499)) - assert_array_almost_equal(stats.mstats.friedmanchisquare(x3[0],x3[1],x3[2],x3[3]),(10.68, 0.0135882729582176)) + # assert_array_almost_equal(stats.mstats.friedmanchisquare(x2[0],x2[1],x2[2],x2[3]), + # (18.9428571428571, 0.000280938375189499)) + assert_array_almost_equal(stats.mstats.friedmanchisquare(x3[0],x3[1],x3[2],x3[3]), + (10.68, 0.0135882729582176)) np.testing.assert_raises(ValueError,stats.mstats.friedmanchisquare,x3[0],x3[1]) + def test_kstest(): - #from numpy.testing import assert_almost_equal + # from numpy.testing import assert_almost_equal # comparing with values from R x = np.linspace(-1,1,9) D,p = stats.kstest(x,'norm') - assert_almost_equal( D, 0.15865525393145705, 12) - assert_almost_equal( p, 0.95164069201518386, 1) + assert_almost_equal(D, 0.15865525393145705, 12) + assert_almost_equal(p, 0.95164069201518386, 1) x = np.linspace(-15,15,9) D,p = stats.kstest(x,'norm') - assert_almost_equal( D, 0.44435602715924361, 15) - assert_almost_equal( p, 0.038850140086788665, 8) + assert_almost_equal(D, 0.44435602715924361, 15) + assert_almost_equal(p, 0.038850140086788665, 8) # the following tests rely on deterministicaly replicated rvs np.random.seed(987654321) x = stats.norm.rvs(loc=0.2, size=100) D,p = stats.kstest(x, 'norm', mode='asymp') - assert_almost_equal( D, 0.12464329735846891, 15) - assert_almost_equal( p, 0.089444888711820769, 15) - assert_almost_equal( np.array(stats.kstest(x, 'norm', mode='asymp')), + assert_almost_equal(D, 0.12464329735846891, 15) + assert_almost_equal(p, 0.089444888711820769, 15) + assert_almost_equal(np.array(stats.kstest(x, 'norm', mode='asymp')), np.array((0.12464329735846891, 0.089444888711820769)), 15) - assert_almost_equal( np.array(stats.kstest(x,'norm', alternative = 'less')), + assert_almost_equal(np.array(stats.kstest(x,'norm', alternative='less')), np.array((0.12464329735846891, 0.040989164077641749)), 15) # this 'greater' test fails with precision of decimal=14 - assert_almost_equal( np.array(stats.kstest(x,'norm', alternative = 'greater')), + assert_almost_equal(np.array(stats.kstest(x,'norm', alternative='greater')), np.array((0.0072115233216310994, 0.98531158590396228)), 12) - #missing: no test that uses *args + # missing: no test that uses 
*args def test_ks_2samp(): - #exact small sample solution + # exact small sample solution data1 = np.array([1.0,2.0]) data2 = np.array([1.0,2.0,3.0]) assert_almost_equal(np.array(stats.ks_2samp(data1+0.01,data2)), np.array((0.33333333333333337, 0.99062316386915694))) assert_almost_equal(np.array(stats.ks_2samp(data1-0.01,data2)), np.array((0.66666666666666674, 0.42490954988801982))) - #these can also be verified graphically + # these can also be verified graphically assert_almost_equal( np.array(stats.ks_2samp(np.linspace(1,100,100), np.linspace(1,100,100)+2+0.1)), @@ -1528,7 +1829,7 @@ def test_ks_2samp(): np.array(stats.ks_2samp(np.linspace(1,100,100), np.linspace(1,100,100)+2-0.1)), np.array((0.020000000000000018, 0.99999999999999933))) - #these are just regression tests + # these are just regression tests assert_almost_equal( np.array(stats.ks_2samp(np.linspace(1,100,100), np.linspace(1,100,110)+20.1)), @@ -1538,8 +1839,9 @@ def test_ks_2samp(): np.linspace(1,100,110)+20-0.1)), np.array((0.20818181818181825, 0.017981441789762638))) + def test_ttest_rel(): - #regression test + # regression test tr,pr = 0.81248591389165692, 0.41846234511362157 tpr = ([tr,-tr],[pr,pr]) @@ -1555,7 +1857,7 @@ def test_ttest_rel(): t,p = stats.ttest_rel(rvs1_2D, rvs2_2D, axis=1) assert_array_almost_equal([t,p],tpr) - #test on 3 dimensions + # test on 3 dimensions rvs1_3D = np.dstack([rvs1_2D,rvs1_2D,rvs1_2D]) rvs2_3D = np.dstack([rvs2_2D,rvs2_2D,rvs2_2D]) t,p = stats.ttest_rel(rvs1_3D, rvs2_3D, axis=1) @@ -1570,20 +1872,25 @@ def test_ttest_rel(): olderr = np.seterr(all='ignore') try: - #test zero division problem + # test zero division problem t,p = stats.ttest_rel([0,0,0],[1,1,1]) assert_equal((np.abs(t),p), (np.inf, 0)) assert_equal(stats.ttest_rel([0,0,0], [0,0,0]), (np.nan, np.nan)) - #check that nan in input array result in nan output + # check that nan in input array result in nan output anan = np.array([[1,np.nan],[-1,1]]) assert_equal(stats.ttest_ind(anan, np.zeros((2,2))),([0, np.nan], [1,np.nan])) finally: np.seterr(**olderr) + # test incorrect input shape raise an error + x = np.arange(24) + assert_raises(ValueError, stats.ttest_rel, x.reshape((8, 3)), + x.reshape((2, 3, 4))) + def test_ttest_ind(): - #regression test + # regression test tr = 1.0912746897927283 pr = 0.27647818616351882 tpr = ([tr,-tr],[pr,pr]) @@ -1600,7 +1907,7 @@ def test_ttest_ind(): t,p = stats.ttest_ind(rvs1_2D, rvs2_2D, axis=1) assert_array_almost_equal([t,p],tpr) - #test on 3 dimensions + # test on 3 dimensions rvs1_3D = np.dstack([rvs1_2D,rvs1_2D,rvs1_2D]) rvs2_3D = np.dstack([rvs2_2D,rvs2_2D,rvs2_2D]) t,p = stats.ttest_ind(rvs1_3D, rvs2_3D, axis=1) @@ -1615,34 +1922,34 @@ def test_ttest_ind(): olderr = np.seterr(all='ignore') try: - #test zero division problem + # test zero division problem t,p = stats.ttest_ind([0,0,0],[1,1,1]) assert_equal((np.abs(t),p), (np.inf, 0)) assert_equal(stats.ttest_ind([0,0,0], [0,0,0]), (np.nan, np.nan)) - #check that nan in input array result in nan output + # check that nan in input array result in nan output anan = np.array([[1,np.nan],[-1,1]]) assert_equal(stats.ttest_ind(anan, np.zeros((2,2))),([0, np.nan], [1,np.nan])) finally: np.seterr(**olderr) -def test_ttest_ind_with_uneq_var(): +def test_ttest_ind_with_uneq_var(): # check vs. 
R a = (1, 2, 3) b = (1.1, 2.9, 4.2) pr = 0.53619490753126731 tr = -0.68649512735572582 - t, p = stats.ttest_ind(a, b, equal_var = False) + t, p = stats.ttest_ind(a, b, equal_var=False) assert_array_almost_equal([t,p], [tr, pr]) a = (1, 2, 3, 4) pr = 0.84354139131608286 tr = -0.2108663315950719 - t, p = stats.ttest_ind(a, b, equal_var = False) + t, p = stats.ttest_ind(a, b, equal_var=False) assert_array_almost_equal([t,p], [tr, pr]) - #regression test + # regression test tr = 1.0912746897927283 tr_uneq_n = 0.66745638708050492 pr = 0.27647831993021388 @@ -1655,49 +1962,50 @@ def test_ttest_ind_with_uneq_var(): rvs1_2D = np.array([rvs1, rvs2]) rvs2_2D = np.array([rvs2, rvs1]) - t,p = stats.ttest_ind(rvs1, rvs2, axis=0, equal_var = False) + t,p = stats.ttest_ind(rvs1, rvs2, axis=0, equal_var=False) assert_array_almost_equal([t,p],(tr,pr)) - t,p = stats.ttest_ind(rvs1, rvs3, axis =0, equal_var = False) + t,p = stats.ttest_ind(rvs1, rvs3, axis=0, equal_var=False) assert_array_almost_equal([t,p], (tr_uneq_n, pr_uneq_n)) - t,p = stats.ttest_ind(rvs1_2D.T, rvs2_2D.T, axis=0, equal_var = False) + t,p = stats.ttest_ind(rvs1_2D.T, rvs2_2D.T, axis=0, equal_var=False) assert_array_almost_equal([t,p],tpr) - t,p = stats.ttest_ind(rvs1_2D, rvs2_2D, axis=1, equal_var = False) + t,p = stats.ttest_ind(rvs1_2D, rvs2_2D, axis=1, equal_var=False) assert_array_almost_equal([t,p],tpr) - #test on 3 dimensions + # test on 3 dimensions rvs1_3D = np.dstack([rvs1_2D,rvs1_2D,rvs1_2D]) rvs2_3D = np.dstack([rvs2_2D,rvs2_2D,rvs2_2D]) - t,p = stats.ttest_ind(rvs1_3D, rvs2_3D, axis=1, equal_var = False) + t,p = stats.ttest_ind(rvs1_3D, rvs2_3D, axis=1, equal_var=False) assert_almost_equal(np.abs(t), np.abs(tr)) assert_array_almost_equal(np.abs(p), pr) assert_equal(t.shape, (2, 3)) t,p = stats.ttest_ind(np.rollaxis(rvs1_3D,2), np.rollaxis(rvs2_3D,2), - axis=2, equal_var = False) + axis=2, equal_var=False) assert_array_almost_equal(np.abs(t), np.abs(tr)) assert_array_almost_equal(np.abs(p), pr) assert_equal(t.shape, (3, 2)) olderr = np.seterr(all='ignore') try: - #test zero division problem - t,p = stats.ttest_ind([0,0,0],[1,1,1], equal_var = False) + # test zero division problem + t,p = stats.ttest_ind([0,0,0],[1,1,1], equal_var=False) assert_equal((np.abs(t),p), (np.inf, 0)) - assert_equal(stats.ttest_ind([0,0,0], [0,0,0], equal_var = False), (np.nan, np.nan)) + assert_equal(stats.ttest_ind([0,0,0], [0,0,0], equal_var=False), (np.nan, np.nan)) - #check that nan in input array result in nan output + # check that nan in input array result in nan output anan = np.array([[1,np.nan],[-1,1]]) - assert_equal(stats.ttest_ind(anan, np.zeros((2,2)), equal_var = False), + assert_equal(stats.ttest_ind(anan, np.zeros((2,2)), equal_var=False), ([0, np.nan], [1,np.nan])) finally: np.seterr(**olderr) + def test_ttest_1samp_new(): n1, n2, n3 = (10,15,20) rvn1 = stats.norm.rvs(loc=5,scale=10,size=(n1,n2,n3)) - #check multidimensional array and correct axis handling - #deterministic rvn1 and rvn2 would be better as in test_ttest_rel + # check multidimensional array and correct axis handling + # deterministic rvn1 and rvn2 would be better as in test_ttest_rel t1,p1 = stats.ttest_1samp(rvn1[:,:,:], np.ones((n2,n3)),axis=0) t2,p2 = stats.ttest_1samp(rvn1[:,:,:], 1,axis=0) t3,p3 = stats.ttest_1samp(rvn1[:,0,0], 1) @@ -1721,12 +2029,12 @@ def test_ttest_1samp_new(): olderr = np.seterr(all='ignore') try: - #test zero division problem + # test zero division problem t,p = stats.ttest_1samp([0,0,0], 1) assert_equal((np.abs(t),p), (np.inf, 0)) 
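+        # (Zero sample variance makes the t denominator 0: a non-zero mean
+        # difference gives t = +/-inf with p = 0, while a zero difference is
+        # 0/0 and is reported as nan below.)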
assert_equal(stats.ttest_1samp([0,0,0], 0), (np.nan, np.nan)) - #check that nan in input array result in nan output + # check that nan in input array result in nan output anan = np.array([[1,np.nan],[-1,1]]) assert_equal(stats.ttest_1samp(anan, 0),([0, np.nan], [1,np.nan])) finally: @@ -1735,9 +2043,9 @@ def test_ttest_1samp_new(): def test_describe(): x = np.vstack((np.ones((3,4)),2*np.ones((2,4)))) - nc, mmc = (5, ([ 1., 1., 1., 1.], [ 2., 2., 2., 2.])) - mc = np.array([ 1.4, 1.4, 1.4, 1.4]) - vc = np.array([ 0.3, 0.3, 0.3, 0.3]) + nc, mmc = (5, ([1., 1., 1., 1.], [2., 2., 2., 2.])) + mc = np.array([1.4, 1.4, 1.4, 1.4]) + vc = np.array([0.3, 0.3, 0.3, 0.3]) skc = [0.40824829046386357]*4 kurtc = [-1.833333333333333]*4 n, mm, m, v, sk, kurt = stats.describe(x) @@ -1745,70 +2053,77 @@ def test_describe(): assert_equal(mm, mmc) assert_equal(m, mc) assert_equal(v, vc) - assert_array_almost_equal(sk, skc, decimal=13) #not sure about precision + assert_array_almost_equal(sk, skc, decimal=13) # not sure about precision assert_array_almost_equal(kurt, kurtc, decimal=13) n, mm, m, v, sk, kurt = stats.describe(x.T, axis=1) assert_equal(n, nc) assert_equal(mm, mmc) assert_equal(m, mc) assert_equal(v, vc) - assert_array_almost_equal(sk, skc, decimal=13) #not sure about precision + assert_array_almost_equal(sk, skc, decimal=13) # not sure about precision assert_array_almost_equal(kurt, kurtc, decimal=13) + def test_normalitytests(): # numbers verified with R: dagoTest in package fBasics st_normal, st_skew, st_kurt = (3.92371918, 1.98078826, -0.01403734) - pv_normal, pv_skew, pv_kurt = (0.14059673, 0.04761502, 0.98880019) + pv_normal, pv_skew, pv_kurt = (0.14059673, 0.04761502, 0.98880019) x = np.array((-2,-1,0,1,2,3)*4)**2 yield assert_array_almost_equal, stats.normaltest(x), (st_normal, pv_normal) yield assert_array_almost_equal, stats.skewtest(x), (st_skew, pv_skew) yield assert_array_almost_equal, stats.kurtosistest(x), (st_kurt, pv_kurt) + # Test axis=None (equal to axis=0 for 1-D input) + yield (assert_array_almost_equal, stats.normaltest(x, axis=None), + (st_normal, pv_normal)) + yield (assert_array_almost_equal, stats.skewtest(x, axis=None), + (st_skew, pv_skew)) + yield (assert_array_almost_equal, stats.kurtosistest(x, axis=None), + (st_kurt, pv_kurt)) -#class TestJarqueBera(TestCase): -# def test_jarque_bera_stats(self): -# np.random.seed(987654321) -# x = np.random.normal(0, 1, 100000) -# y = np.random.chisquare(10000, 100000) -# z = np.random.rayleigh(1, 100000) -# -# assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(y)[1]) -# assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(z)[1]) -# assert_(stats.jarque_bera(y)[1] > stats.jarque_bera(z)[1]) -# -# def test_jarque_bera_array_like(self): -# np.random.seed(987654321) -# x = np.random.normal(0, 1, 100000) -# -# JB1, p1 = stats.jarque_bera(list(x)) -# JB2, p2 = stats.jarque_bera(tuple(x)) -# JB3, p3 = stats.jarque_bera(x.reshape(2, 50000)) -# -# assert_(JB1 == JB2 == JB3) -# assert_(p1 == p2 == p3) -# -# def test_jarque_bera_size(self): -# assert_raises(ValueError, stats.jarque_bera, []) +class TestJarqueBera(TestCase): + def test_jarque_bera_stats(self): + np.random.seed(987654321) + x = np.random.normal(0, 1, 100000) + y = np.random.chisquare(10000, 100000) + z = np.random.rayleigh(1, 100000) -def test_skewtest_too_few_samples(): - """Regression test for ticket #1492. 
+ assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(y)[1]) + assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(z)[1]) + assert_(stats.jarque_bera(y)[1] > stats.jarque_bera(z)[1]) - skewtest requires at least 8 samples; 7 should raise a ValueError. - """ + def test_jarque_bera_array_like(self): + np.random.seed(987654321) + x = np.random.normal(0, 1, 100000) + + JB1, p1 = stats.jarque_bera(list(x)) + JB2, p2 = stats.jarque_bera(tuple(x)) + JB3, p3 = stats.jarque_bera(x.reshape(2, 50000)) + + assert_(JB1 == JB2 == JB3) + assert_(p1 == p2 == p3) + + def test_jarque_bera_size(self): + assert_raises(ValueError, stats.jarque_bera, []) + + +def test_skewtest_too_few_samples(): + # Regression test for ticket #1492. + # skewtest requires at least 8 samples; 7 should raise a ValueError. x = np.arange(7.0) assert_raises(ValueError, stats.skewtest, x) -def test_kurtosistest_too_few_samples(): - """Regression test for ticket #1425. - kurtosistest requires at least 5 samples; 4 should raise a ValueError. - """ +def test_kurtosistest_too_few_samples(): + # Regression test for ticket #1425. + # kurtosistest requires at least 5 samples; 4 should raise a ValueError. x = np.arange(4.0) assert_raises(ValueError, stats.kurtosistest, x) -def mannwhitneyu(): - x = np.array([ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + +def test_mannwhitneyu(): + x = np.array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., @@ -1825,7 +2140,7 @@ def mannwhitneyu(): 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]) - y = np.array([ 1., 1., 1., 1., 1., 1., 1., 2., 1., 2., 1., 1., 1., + y = np.array([1., 1., 1., 1., 1., 1., 1., 2., 1., 2., 1., 1., 1., 1., 2., 1., 1., 1., 2., 1., 1., 1., 1., 1., 2., 1., 1., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 2., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., @@ -1836,14 +2151,14 @@ def mannwhitneyu(): 1., 2., 1., 1., 2., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1.]) - #p-value verified with matlab and R to 5 significant digits + # p-value verified with matlab and R to 5 significant digits assert_array_almost_equal(stats.stats.mannwhitneyu(x,y), (16980.5, 2.8214327656317373e-005), decimal=12) - def test_pointbiserial(): - # copied from mstats tests removing nans + # same as mstats test except for the nan + # Test data: http://support.sas.com/ctx/samples/index.jsp?sid=490&tab=output x = [1,0,1,1,1,1,0,1,0,0,0,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0, 0,0,0,0,1] y = [14.8,13.8,12.4,10.1,7.1,6.1,5.8,4.6,4.3,3.5,3.3,3.2,3.0, @@ -1853,79 +2168,117 @@ def test_pointbiserial(): def test_obrientransform(): - #this is a regression test to check np.var replacement - #I didn't separately verigy the numbers + # A couple tests calculated by hand. + x1 = np.array([0, 2, 4]) + t1 = stats.obrientransform(x1) + expected = [7, -2, 7] + assert_allclose(t1[0], expected) + + x2 = np.array([0, 3, 6, 9]) + t2 = stats.obrientransform(x2) + expected = np.array([30, 0, 0, 30]) + assert_allclose(t2[0], expected) + + # Test two arguments. + a, b = stats.obrientransform(x1, x2) + assert_equal(a, t1[0]) + assert_equal(b, t2[0]) + + # Test three arguments. 
+ a, b, c = stats.obrientransform(x1, x2, x1) + assert_equal(a, t1[0]) + assert_equal(b, t2[0]) + assert_equal(c, t1[0]) + + # This is a regression test to check np.var replacement. + # The author of this test didn't separately verify the numbers. x1 = np.arange(5) result = np.array( - [[ 5.41666667, 1.04166667, -0.41666667, 1.04166667, 5.41666667], - [ 21.66666667, 4.16666667, -1.66666667, 4.16666667, 21.66666667]]) + [[5.41666667, 1.04166667, -0.41666667, 1.04166667, 5.41666667], + [21.66666667, 4.16666667, -1.66666667, 4.16666667, 21.66666667]]) assert_array_almost_equal(stats.obrientransform(x1, 2*x1), result, decimal=8) + # Example from "O'Brien Test for Homogeneity of Variance" + # by Herve Abdi. + values = range(5, 11) + reps = np.array([5, 11, 9, 3, 2, 2]) + data = np.repeat(values, reps) + transformed_values = np.array([3.1828, 0.5591, 0.0344, + 1.6086, 5.2817, 11.0538]) + expected = np.repeat(transformed_values, reps) + result = stats.obrientransform(data) + assert_array_almost_equal(result[0], expected, decimal=4) + class HarMeanTestCase: def test_1dlist(self): - ''' Test a 1d list''' - a=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100] + # Test a 1d list + a = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] b = 34.1417152147 self.do(a, b) + def test_1darray(self): - ''' Test a 1d array''' - a=np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) + # Test a 1d array + a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) b = 34.1417152147 self.do(a, b) + def test_1dma(self): - ''' Test a 1d masked array''' - a=np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) + # Test a 1d masked array + a = np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) b = 34.1417152147 self.do(a, b) + def test_1dmavalue(self): - ''' Test a 1d masked array with a masked value''' - a=np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100], + # Test a 1d masked array with a masked value + a = np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100], mask=[0,0,0,0,0,0,0,0,0,1]) b = 31.8137186141 self.do(a, b) # Note the next tests use axis=None as default, not axis=0 def test_2dlist(self): - ''' Test a 2d list''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] + # Test a 2d list + a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] b = 38.6696271841 self.do(a, b) + def test_2darray(self): - ''' Test a 2d array''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] + # Test a 2d array + a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] b = 38.6696271841 self.do(np.array(a), b) + def test_2dma(self): - ''' Test a 2d masked array''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] + # Test a 2d masked array + a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] b = 38.6696271841 self.do(np.ma.array(a), b) + def test_2daxis0(self): - ''' Test a 2d list with axis=0''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.array([ 22.88135593, 39.13043478, 52.90076336, 65.45454545]) + # Test a 2d list with axis=0 + a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] + b = np.array([22.88135593, 39.13043478, 52.90076336, 65.45454545]) self.do(a, b, axis=0) + def test_2daxis1(self): - ''' Test a 2d list with axis=1''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.array([ 19.2 , 63.03939962, 103.80078637]) + # Test a 2d list with axis=1 + a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] + b = np.array([19.2, 63.03939962, 103.80078637]) self.do(a, b, axis=1) + def test_2dmatrixdaxis0(self): - ''' 
Test a 2d list with axis=0''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.matrix([[ 22.88135593, 39.13043478, 52.90076336, 65.45454545]]) + # Test a 2d list with axis=0 + a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] + b = np.matrix([[22.88135593, 39.13043478, 52.90076336, 65.45454545]]) self.do(np.matrix(a), b, axis=0) + def test_2dmatrixaxis1(self): - ''' Test a 2d list with axis=1''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.matrix([[ 19.2 , 63.03939962, 103.80078637]]).T + # Test a 2d list with axis=1 + a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] + b = np.matrix([[19.2, 63.03939962, 103.80078637]]).T self.do(np.matrix(a), b, axis=1) -## def test_dtype(self): -## ''' Test a 1d list with a new dtype''' -## a=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100] -## b = 34.1417152147 -## self.do(a, b, dtype=np.float128) # does not work on Win32 + class TestHarMean(HarMeanTestCase, TestCase): def do(self, a, b, axis=None, dtype=None): @@ -1933,73 +2286,79 @@ class TestHarMean(HarMeanTestCase, TestCase): assert_almost_equal(b, x) assert_equal(x.dtype, dtype) + class GeoMeanTestCase: def test_1dlist(self): - ''' Test a 1d list''' - a=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100] + # Test a 1d list + a = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] b = 45.2872868812 self.do(a, b) + def test_1darray(self): - ''' Test a 1d array''' - a=np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) + # Test a 1d array + a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) b = 45.2872868812 self.do(a, b) + def test_1dma(self): - ''' Test a 1d masked array''' - a=np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) + # Test a 1d masked array + a = np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) b = 45.2872868812 self.do(a, b) + def test_1dmavalue(self): - ''' Test a 1d masked array with a masked value''' - a=np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100], mask=[0,0,0,0,0,0,0,0,0,1]) + # Test a 1d masked array with a masked value + a = np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100], mask=[0,0,0,0,0,0,0,0,0,1]) b = 41.4716627439 self.do(a, b) # Note the next tests use axis=None as default, not axis=0 def test_2dlist(self): - ''' Test a 2d list''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] + # Test a 2d list + a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] b = 52.8885199 self.do(a, b) + def test_2darray(self): - ''' Test a 2d array''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] + # Test a 2d array + a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] b = 52.8885199 self.do(np.array(a), b) + def test_2dma(self): - ''' Test a 2d masked array''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] + # Test a 2d masked array + a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] b = 52.8885199 self.do(np.ma.array(a), b) + def test_2daxis0(self): - ''' Test a 2d list with axis=0''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.array([35.56893304, 49.32424149, 61.3579244 , 72.68482371]) + # Test a 2d list with axis=0 + a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] + b = np.array([35.56893304, 49.32424149, 61.3579244, 72.68482371]) self.do(a, b, axis=0) + def test_2daxis1(self): - ''' Test a 2d list with axis=1''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.array([ 22.13363839, 64.02171746, 104.40086817]) + # Test a 2d list with axis=1 + a = [[10, 20, 30, 40], [50, 60, 70, 80], 
[90, 100, 110, 120]] + b = np.array([22.13363839, 64.02171746, 104.40086817]) self.do(a, b, axis=1) + def test_2dmatrixdaxis0(self): - ''' Test a 2d list with axis=0''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.matrix([[35.56893304, 49.32424149, 61.3579244 , 72.68482371]]) + # Test a 2d list with axis=0 + a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] + b = np.matrix([[35.56893304, 49.32424149, 61.3579244, 72.68482371]]) self.do(np.matrix(a), b, axis=0) + def test_2dmatrixaxis1(self): - ''' Test a 2d list with axis=1''' - a=[[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] - b = np.matrix([[ 22.13363839, 64.02171746, 104.40086817]]).T + # Test a 2d list with axis=1 + a = [[10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]] + b = np.matrix([[22.13363839, 64.02171746, 104.40086817]]).T self.do(np.matrix(a), b, axis=1) -## def test_dtype(self): -## ''' Test a 1d list with a new dtype''' -## a=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100] -## b = 45.2872868812 -## self.do(a, b, dtype=np.float128) # does not exist on win32 + def test_1dlist0(self): - ''' Test a 1d list with zero element''' - a=[10, 20, 30, 40, 50, 60, 70, 80, 90, 0] - b = 0.0 # due to exp(-inf)=0 + # Test a 1d list with zero element + a = [10, 20, 30, 40, 50, 60, 70, 80, 90, 0] + b = 0.0 # due to exp(-inf)=0 olderr = np.seterr(all='ignore') try: self.do(a, b) @@ -2007,9 +2366,9 @@ class GeoMeanTestCase: np.seterr(**olderr) def test_1darray0(self): - ''' Test a 1d array with zero element''' - a=np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 0]) - b = 0.0 # due to exp(-inf)=0 + # Test a 1d array with zero element + a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 0]) + b = 0.0 # due to exp(-inf)=0 olderr = np.seterr(all='ignore') try: self.do(a, b) @@ -2017,8 +2376,8 @@ class GeoMeanTestCase: np.seterr(**olderr) def test_1dma0(self): - ''' Test a 1d masked array with zero element''' - a=np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 0]) + # Test a 1d masked array with zero element + a = np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 0]) b = 41.4716627439 olderr = np.seterr(all='ignore') try: @@ -2027,8 +2386,8 @@ class GeoMeanTestCase: np.seterr(**olderr) def test_1dmainf(self): - ''' Test a 1d masked array with negative element''' - a=np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, -1]) + # Test a 1d masked array with negative element + a = np.ma.array([10, 20, 30, 40, 50, 60, 70, 80, 90, -1]) b = 41.4716627439 olderr = np.seterr(all='ignore') try: @@ -2036,9 +2395,10 @@ class GeoMeanTestCase: finally: np.seterr(**olderr) + class TestGeoMean(GeoMeanTestCase, TestCase): def do(self, a, b, axis=None, dtype=None): - #Note this doesn't test when axis is not specified + # Note this doesn't test when axis is not specified x = stats.gmean(a, axis=axis, dtype=dtype) assert_almost_equal(b, x) assert_equal(x.dtype, dtype) @@ -2046,7 +2406,7 @@ class TestGeoMean(GeoMeanTestCase, TestCase): def test_binomtest(): # precision tests compared to R for ticket:986 - pp = np.concatenate(( np.linspace(0.1,0.2,5), np.linspace(0.45,0.65,5), + pp = np.concatenate((np.linspace(0.1,0.2,5), np.linspace(0.45,0.65,5), np.linspace(0.85,0.95,5))) n = 501 x = 450 @@ -2060,12 +2420,120 @@ def test_binomtest(): for p, res in zip(pp,results): assert_approx_equal(stats.binom_test(x, n, p), res, - significant=12, err_msg='fail forp=%f'%p) + significant=12, err_msg='fail forp=%f' % p) assert_approx_equal(stats.binom_test(50,100,0.1), 5.8320387857343647e-024, - significant=12, err_msg='fail forp=%f'%p) - 
-class Test_Trim(object): + significant=12, err_msg='fail forp=%f' % p) + + +def test_binomtest2(): + # test added for issue #2384 + res2 = [ + [1.0, 1.0], + [0.5,1.0,0.5], + [0.25,1.00,1.00,0.25], + [0.125,0.625,1.000,0.625,0.125], + [0.0625,0.3750,1.0000,1.0000,0.3750,0.0625], + [0.03125,0.21875,0.68750,1.00000,0.68750,0.21875,0.03125], + [0.015625,0.125000,0.453125,1.000000,1.000000,0.453125,0.125000,0.015625], + [0.0078125,0.0703125,0.2890625,0.7265625,1.0000000,0.7265625,0.2890625, + 0.0703125,0.0078125], + [0.00390625,0.03906250,0.17968750,0.50781250,1.00000000,1.00000000, + 0.50781250,0.17968750,0.03906250,0.00390625], + [0.001953125,0.021484375,0.109375000,0.343750000,0.753906250,1.000000000, + 0.753906250,0.343750000,0.109375000,0.021484375,0.001953125] + ] + + for k in range(1, 11): + res1 = [stats.binom_test(v, k, 0.5) for v in range(k + 1)] + assert_almost_equal(res1, res2[k-1], decimal=10) + + +def test_binomtest3(): + # test added for issue #2384 + # test when x == n*p and neighbors + res3 = [stats.binom_test(v, v*k, 1./k) for v in range(1, 11) + for k in range(2, 11)] + assert_equal(res3, np.ones(len(res3), int)) + + #> bt=c() + #> for(i in as.single(1:10)){for(k in as.single(2:10)){bt = c(bt, binom.test(i-1, k*i,(1/k))$p.value); print(c(i+1, k*i,(1/k)))}} + binom_testm1 = np.array([ + 0.5, 0.5555555555555556, 0.578125, 0.5904000000000003, + 0.5981224279835393, 0.603430543396034, 0.607304096221924, + 0.610255656871054, 0.612579511000001, 0.625, 0.670781893004115, + 0.68853759765625, 0.6980101120000006, 0.703906431368616, + 0.70793209416498, 0.7108561134173507, 0.713076544331419, + 0.714820192935702, 0.6875, 0.7268709038256367, 0.7418963909149174, + 0.74986110468096, 0.7548015520398076, 0.7581671424768577, + 0.760607984787832, 0.762459425024199, 0.7639120677676575, 0.7265625, + 0.761553963657302, 0.774800934828818, 0.7818005980538996, + 0.78613491480358, 0.789084353140195, 0.7912217659828884, + 0.79284214559524, 0.794112956558801, 0.75390625, 0.7856929451142176, + 0.7976688481430754, 0.8039848974727624, 0.807891868948366, + 0.8105487660137676, 0.812473307174702, 0.8139318233591120, + 0.815075399104785, 0.7744140625, 0.8037322594985427, + 0.814742863657656, 0.8205425178645808, 0.8241275984172285, + 0.8265645374416, 0.8283292196088257, 0.829666291102775, + 0.8307144686362666, 0.7905273437499996, 0.8178712053954738, + 0.828116983756619, 0.833508948940494, 0.8368403871552892, + 0.839104213210105, 0.840743186196171, 0.84198481438049, + 0.8429580531563676, 0.803619384765625, 0.829338573944648, + 0.8389591907548646, 0.84401876783902, 0.84714369697889, + 0.8492667010581667, 0.850803474598719, 0.851967542858308, + 0.8528799045949524, 0.8145294189453126, 0.838881732845347, + 0.847979024541911, 0.852760894015685, 0.8557134656773457, + 0.8577190131799202, 0.85917058278431, 0.860270010472127, + 0.861131648404582, 0.823802947998047, 0.846984756807511, + 0.855635653643743, 0.860180994825685, 0.86298688573253, + 0.864892525675245, 0.866271647085603, 0.867316125625004, + 0.8681346531755114 + ]) + + # > bt=c() + # > for(i in as.single(1:10)){for(k in as.single(2:10)){bt = c(bt, binom.test(i+1, k*i,(1/k))$p.value); print(c(i+1, k*i,(1/k)))}} + + binom_testp1 = np.array([ + 0.5, 0.259259259259259, 0.26171875, 0.26272, 0.2632244513031551, + 0.2635138663069203, 0.2636951804161073, 0.2638162407564354, + 0.2639010709000002, 0.625, 0.4074074074074074, 0.42156982421875, + 0.4295746560000003, 0.43473045988554, 0.4383309503172684, + 0.4409884859402103, 0.4430309389962837, 0.444649849401104, 0.6875, + 
0.4927602499618962, 0.5096031427383425, 0.5189636628480, + 0.5249280070771274, 0.5290623300865124, 0.5320974248125793, + 0.5344204730474308, 0.536255847400756, 0.7265625, 0.5496019313526808, + 0.5669248746708034, 0.576436455045805, 0.5824538812831795, + 0.5866053321547824, 0.589642781414643, 0.5919618019300193, + 0.593790427805202, 0.75390625, 0.590868349763505, 0.607983393277209, + 0.617303847446822, 0.623172512167948, 0.627208862156123, + 0.6301556891501057, 0.632401894928977, 0.6341708982290303, + 0.7744140625, 0.622562037497196, 0.639236102912278, 0.648263335014579, + 0.65392850011132, 0.657816519817211, 0.660650782947676, + 0.662808780346311, 0.6645068560246006, 0.7905273437499996, + 0.6478843304312477, 0.6640468318879372, 0.6727589686071775, + 0.6782129857784873, 0.681950188903695, 0.684671508668418, + 0.686741824999918, 0.688369886732168, 0.803619384765625, + 0.668716055304315, 0.684360013879534, 0.6927642396829181, + 0.6980155964704895, 0.701609591890657, 0.7042244320992127, + 0.7062125081341817, 0.707775152962577, 0.8145294189453126, + 0.686243374488305, 0.7013873696358975, 0.709501223328243, + 0.714563595144314, 0.718024953392931, 0.7205416252126137, + 0.722454130389843, 0.723956813292035, 0.823802947998047, + 0.701255953767043, 0.715928221686075, 0.723772209289768, + 0.7286603031173616, 0.7319999279787631, 0.7344267920995765, + 0.736270323773157, 0.737718376096348 + ]) + + res4_p1 = [stats.binom_test(v+1, v*k, 1./k) for v in range(1, 11) + for k in range(2, 11)] + res4_m1 = [stats.binom_test(v-1, v*k, 1./k) for v in range(1, 11) + for k in range(2, 11)] + + assert_almost_equal(res4_p1, binom_testp1, decimal=13) + assert_almost_equal(res4_m1, binom_testm1, decimal=13) + + +class TestTrim(object): # test trim functions def test_trim1(self): a = np.arange(11) @@ -2081,26 +2549,52 @@ class Test_Trim(object): assert_equal(stats.trimboth(np.arange(24).reshape(6,4), 0.2), np.arange(4,20).reshape(4,4)) assert_equal(stats.trimboth(np.arange(24).reshape(4,6).T, 2/6.), - np.array([[ 2, 8, 14, 20],[ 3, 9, 15, 21]])) + np.array([[2, 8, 14, 20],[3, 9, 15, 21]])) assert_raises(ValueError, stats.trimboth, np.arange(24).reshape(4,6).T, 4/6.) def test_trim_mean(self): - assert_equal(stats.trim_mean(np.arange(24).reshape(4,6).T, 2/6.), - np.array([ 2.5, 8.5, 14.5, 20.5])) - assert_equal(stats.trim_mean(np.arange(24).reshape(4,6), 2/6.), - np.array([ 9., 10., 11., 12., 13., 14.])) - assert_equal(stats.trim_mean(np.arange(24), 2/6.), 11.5) + # don't use pre-sorted arrays + a = np.array([4, 8, 2, 0, 9, 5, 10, 1, 7, 3, 6]) + idx = np.array([3, 5, 0, 1, 2, 4]) + a2 = np.arange(24).reshape(6, 4)[idx, :] + a3 = np.arange(24).reshape(6, 4, order='F')[idx, :] + assert_equal(stats.trim_mean(a3, 2/6.), + np.array([2.5, 8.5, 14.5, 20.5])) + assert_equal(stats.trim_mean(a2, 2/6.), + np.array([10., 11., 12., 13.])) + idx4 = np.array([1, 0, 3, 2]) + a4 = np.arange(24).reshape(4, 6)[idx4, :] + assert_equal(stats.trim_mean(a4, 2/6.), + np.array([9., 10., 11., 12., 13., 14.])) + # shuffled arange(24) as array_like + a = [7, 11, 12, 21, 16, 6, 22, 1, 5, 0, 18, 10, 17, 9, 19, 15, 23, + 20, 2, 14, 4, 13, 8, 3] + assert_equal(stats.trim_mean(a, 2/6.), 11.5) assert_equal(stats.trim_mean([5,4,3,1,2,0], 2/6.), 2.5) + # check axis argument + np.random.seed(1234) + a = np.random.randint(20, size=(5, 6, 4, 7)) + for axis in [0, 1, 2, 3, -1]: + res1 = stats.trim_mean(a, 2/6., axis=axis) + res2 = stats.trim_mean(np.rollaxis(a, axis), 2/6.) 
+            assert_equal(res1, res2)
+
+        res1 = stats.trim_mean(a, 2/6., axis=None)
+        res2 = stats.trim_mean(a.ravel(), 2/6.)
+        assert_equal(res1, res2)
+
+        assert_raises(ValueError, stats.trim_mean, a, 0.6)
+
+
 class TestSigamClip(object):
     def test_sigmaclip1(self):
         a = np.concatenate((np.linspace(9.5,10.5,31),np.linspace(0,20,5)))
-        fact = 4 #default
+        fact = 4  # default
         c, low, upp = stats.sigmaclip(a)
-        assert_(c.min()>low)
-        assert_(c.max()<upp)
+        assert_(c.min() > low)
+        assert_(c.max() < upp)
         assert_equal(low, c.mean() - fact*c.std())
         assert_equal(upp, c.mean() + fact*c.std())
         assert_equal(c.size, a.size)
@@ -2109,19 +2603,19 @@ class TestSigamClip(object):
         a = np.concatenate((np.linspace(9.5,10.5,31),np.linspace(0,20,5)))
         fact = 1.5
         c, low, upp = stats.sigmaclip(a, fact, fact)
-        assert_(c.min()>low)
-        assert_(c.max()<upp)
+        assert_(c.min() > low)
+        assert_(c.max() < upp)
         assert_equal(low, c.mean() - fact*c.std())
         assert_equal(upp, c.mean() + fact*c.std())
         assert_equal(c.size, 4)
-        assert_equal(a.size, 36) #check original array unchanged
+        assert_equal(a.size, 36)  # check original array unchanged

     def test_sigmaclip3(self):
         a = np.concatenate((np.linspace(9.5,10.5,11),np.linspace(-100,-50,3)))
         fact = 1.8
         c, low, upp = stats.sigmaclip(a, fact, fact)
-        assert_(c.min()>low)
-        assert_(c.max()<upp)
+        assert_(c.min() > low)
+        assert_(c.max() < upp)
         assert_equal(low, c.mean() - fact*c.std())
         assert_equal(upp, c.mean() + fact*c.std())
         assert_equal(c, np.linspace(9.5,10.5,11))
@@ -2130,12 +2624,12 @@ class TestSigamClip(object):

 class TestFOneWay(TestCase):
     def test_trivial(self):
-        """A trivial test of stats.f_oneway, with F=0."""
+        # A trivial test of stats.f_oneway, with F=0.
         F, p = stats.f_oneway([0,2], [0,2])
         assert_equal(F, 0.0)

     def test_basic(self):
-        """A test of stats.f_oneway, with F=2."""
+        # A test of stats.f_oneway, with F=2.
         F, p = stats.f_oneway([0,2], [2,4])
         # Despite being a floating point calculation, this data should
         # result in F being exactly 2.0.
@@ -2145,7 +2639,6 @@ class TestFOneWay(TestCase):

 class TestKruskal(TestCase):
     def test_simple(self):
-        """A really simple case for stats.kruskal"""
         x = [1]
         y = [2]
         h, p = stats.kruskal(x, y)
@@ -2156,7 +2649,6 @@ class TestKruskal(TestCase):
         assert_approx_equal(p, stats.chisqprob(h, 1))

     def test_basic(self):
-        """A basic test, with no ties."""
         x = [1, 3, 5, 7, 9]
         y = [2, 4, 6, 8, 10]
         h, p = stats.kruskal(x, y)
@@ -2167,7 +2659,6 @@ class TestKruskal(TestCase):
         assert_approx_equal(p, stats.chisqprob(3./11, 1))

     def test_simple_tie(self):
-        """A simple case with a tie."""
         x = [1]
         y = [1, 2]
         h_uncorr = 1.5**2 + 2*2.25**2 - 12
@@ -2179,7 +2670,6 @@ class TestKruskal(TestCase):
         assert_equal(h, expected)

     def test_another_tie(self):
-        """Another test of stats.kruskal with a tie."""
         x = [1, 1, 1, 2]
         y = [2, 2, 2, 2]
         h_uncorr = (12. / 8. / 9.) * 4 * (3**2 + 6**2) - 3 * 9
@@ -2189,7 +2679,7 @@ class TestKruskal(TestCase):
         assert_approx_equal(h, expected)

     def test_three_groups(self):
-        """A test of stats.kruskal with three groups, with ties."""
+        # A test of stats.kruskal with three groups, with ties.
x = [1, 1, 1] y = [2, 2, 2] z = [2, 2] diff --git a/pywafo/src/wafo/stats/tests/test_tukeylambda_stats.py b/pywafo/src/wafo/stats/tests/test_tukeylambda_stats.py new file mode 100644 index 0000000..9d3d654 --- /dev/null +++ b/pywafo/src/wafo/stats/tests/test_tukeylambda_stats.py @@ -0,0 +1,91 @@ +from __future__ import division, print_function, absolute_import + +import numpy as np +from numpy.testing import assert_allclose, assert_equal, run_module_suite + +from scipy.stats._tukeylambda_stats import tukeylambda_variance, \ + tukeylambda_kurtosis + + +def test_tukeylambda_stats_known_exact(): + """Compare results with some known exact formulas.""" + # Some exact values of the Tukey Lambda variance and kurtosis: + # lambda var kurtosis + # 0 pi**2/3 6/5 (logistic distribution) + # 0.5 4 - pi (5/3 - pi/2)/(pi/4 - 1)**2 - 3 + # 1 1/3 -6/5 (uniform distribution on (-1,1)) + # 2 1/12 -6/5 (uniform distribution on (-1/2, 1/2)) + + # lambda = 0 + var = tukeylambda_variance(0) + assert_allclose(var, np.pi**2 / 3, atol=1e-12) + kurt = tukeylambda_kurtosis(0) + assert_allclose(kurt, 1.2, atol=1e-10) + + # lambda = 0.5 + var = tukeylambda_variance(0.5) + assert_allclose(var, 4 - np.pi, atol=1e-12) + kurt = tukeylambda_kurtosis(0.5) + desired = (5./3 - np.pi/2) / (np.pi/4 - 1)**2 - 3 + assert_allclose(kurt, desired, atol=1e-10) + + # lambda = 1 + var = tukeylambda_variance(1) + assert_allclose(var, 1.0 / 3, atol=1e-12) + kurt = tukeylambda_kurtosis(1) + assert_allclose(kurt, -1.2, atol=1e-10) + + # lambda = 2 + var = tukeylambda_variance(2) + assert_allclose(var, 1.0 / 12, atol=1e-12) + kurt = tukeylambda_kurtosis(2) + assert_allclose(kurt, -1.2, atol=1e-10) + + +def test_tukeylambda_stats_mpmath(): + """Compare results with some values that were computed using mpmath.""" + a10 = dict(atol=1e-10, rtol=0) + a12 = dict(atol=1e-12, rtol=0) + data = [ + # lambda variance kurtosis + [-0.1, 4.78050217874253547, 3.78559520346454510], + [-0.0649, 4.16428023599895777, 2.52019675947435718], + [-0.05, 3.93672267890775277, 2.13129793057777277], + [-0.001, 3.30128380390964882, 1.21452460083542988], + [0.001, 3.27850775649572176, 1.18560634779287585], + [0.03125, 2.95927803254615800, 0.804487555161819980], + [0.05, 2.78281053405464501, 0.611604043886644327], + [0.0649, 2.65282386754100551, 0.476834119532774540], + [1.2, 0.242153920578588346, -1.23428047169049726], + [10.0, 0.00095237579757703597, 2.37810697355144933], + [20.0, 0.00012195121951131043, 7.37654321002709531], + ] + + for lam, var_expected, kurt_expected in data: + var = tukeylambda_variance(lam) + assert_allclose(var, var_expected, **a12) + kurt = tukeylambda_kurtosis(lam) + assert_allclose(kurt, kurt_expected, **a10) + + # Test with vector arguments (most of the other tests are for single + # values). 
+ lam, var_expected, kurt_expected = zip(*data) + var = tukeylambda_variance(lam) + assert_allclose(var, var_expected, **a12) + kurt = tukeylambda_kurtosis(lam) + assert_allclose(kurt, kurt_expected, **a10) + + +def test_tukeylambda_stats_invalid(): + """Test values of lambda outside the domains of the functions.""" + lam = [-1.0, -0.5] + var = tukeylambda_variance(lam) + assert_equal(var, np.array([np.nan, np.inf])) + + lam = [-1.0, -0.25] + kurt = tukeylambda_kurtosis(lam) + assert_equal(kurt, np.array([np.nan, np.inf])) + + +if __name__ == "__main__": + run_module_suite() diff --git a/pywafo/src/wafo/stats/vonmises.py b/pywafo/src/wafo/stats/vonmises.py new file mode 100644 index 0000000..753bf6b --- /dev/null +++ b/pywafo/src/wafo/stats/vonmises.py @@ -0,0 +1,47 @@ +from __future__ import division, print_function, absolute_import + +import numpy as np +import scipy.stats +from scipy.special import i0 + + +def von_mises_cdf_series(k,x,p): + x = float(x) + s = np.sin(x) + c = np.cos(x) + sn = np.sin(p*x) + cn = np.cos(p*x) + R = 0 + V = 0 + for n in range(p-1,0,-1): + sn, cn = sn*c - cn*s, cn*c + sn*s + R = 1./(2*n/k + R) + V = R*(sn/n+V) + + return 0.5+x/(2*np.pi) + V/np.pi + + +def von_mises_cdf_normalapprox(k,x,C1): + b = np.sqrt(2/np.pi)*np.exp(k)/i0(k) + z = b*np.sin(x/2.) + return scipy.stats.norm.cdf(z) + + +def von_mises_cdf(k,x): + ix = 2*np.pi*np.round(x/(2*np.pi)) + x = x-ix + k = float(k) + + # These values should give 12 decimal digits + CK = 50 + a = [28., 0.5, 100., 5.0] + C1 = 50.1 + + if k < CK: + p = int(np.ceil(a[0]+a[1]*k-a[2]/(k+a[3]))) + + F = np.clip(von_mises_cdf_series(k,x,p),0,1) + else: + F = von_mises_cdf_normalapprox(k,x,C1) + + return F+ix diff --git a/pywafo/src/wafo/stats/vonmises_cython.pyx b/pywafo/src/wafo/stats/vonmises_cython.pyx new file mode 100644 index 0000000..4c24986 --- /dev/null +++ b/pywafo/src/wafo/stats/vonmises_cython.pyx @@ -0,0 +1,76 @@ +import numpy as np +import scipy.stats +from scipy.special import i0 +import numpy.testing +cimport numpy as np + +cdef extern from "math.h": + double cos(double theta) + double sin(double theta) + + +cdef double von_mises_cdf_series(double k,double x,unsigned int p): + cdef double s, c, sn, cn, R, V + cdef unsigned int n + s = sin(x) + c = cos(x) + sn = sin(p*x) + cn = cos(p*x) + R = 0 + V = 0 + for n in range(p-1,0,-1): + sn, cn = sn*c - cn*s, cn*c + sn*s + R = 1./(2*n/k + R) + V = R*(sn/n+V) + + return 0.5+x/(2*np.pi) + V/np.pi + +def von_mises_cdf_normalapprox(k,x,C1): + b = np.sqrt(2/np.pi)*np.exp(k)/i0(k) + z = b*np.sin(x/2.) 
+    C = 24*k
+    chi = z - z**3/((C-2*z**2-16)/3.-(z**4+7/4.*z**2+167./2)/(C+C1-z**2+3))**2
+    return scipy.stats.norm.cdf(z)
+
+cimport cython
+@cython.boundscheck(False)
+def von_mises_cdf(k,x):
+    cdef np.ndarray[double, ndim=1] temp, temp_xs, temp_ks
+    cdef unsigned int i, p
+    cdef double a1, a2, a3, a4, C1, CK
+    #k,x = np.broadcast_arrays(np.asarray(k),np.asarray(x))
+    k = np.asarray(k)
+    x = np.asarray(x)
+    zerodim = k.ndim==0 and x.ndim==0
+
+    k = np.atleast_1d(k)
+    x = np.atleast_1d(x)
+    ix = np.round(x/(2*np.pi))
+    x = x-ix*2*np.pi
+
+    # These values should give 12 decimal digits
+    CK = 50
+    a1, a2, a3, a4 = [28., 0.5, 100., 5.0]
+    C1 = 50.1
+
+    bx, bk = np.broadcast_arrays(x,k)
+    result = np.empty(bx.shape,dtype=np.float)
+
+    c_small_k = bk < CK
+    temp = result[c_small_k]
+    temp_xs = bx[c_small_k].astype(np.float)
+    temp_ks = bk[c_small_k].astype(np.float)
+    for i in range(len(temp)):
+        p = <int>(1+a1+a2*temp_ks[i]-a3/(temp_ks[i]+a4))
+        temp[i] = von_mises_cdf_series(temp_ks[i],temp_xs[i],p)
+        if temp[i]<0:
+            temp[i]=0
+        elif temp[i]>1:
+            temp[i]=1
+    result[c_small_k] = temp
+    result[~c_small_k] = von_mises_cdf_normalapprox(bk[~c_small_k],bx[~c_small_k],C1)
+
+    if not zerodim:
+        return result+ix
+    else:
+        return (result+ix)[0]
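The von Mises CDF code added above follows one scheme in both the pure-Python and Cython versions: reduce x to its base period, evaluate the backwards-recurrence series when kappa < CK = 50, and switch to a normal approximation for larger kappa. A minimal cross-check sketch against the distribution already exposed by scipy.stats; the wafo.stats.vonmises import path is assumed from the file location in this patch, and the tolerance is illustrative only:

    import scipy.stats

    # Assumed import path, inferred from pywafo/src/wafo/stats/vonmises.py above.
    from wafo.stats.vonmises import von_mises_cdf

    # kappa = 0.5 and 5.0 exercise the series branch (kappa < CK = 50);
    # kappa = 80.0 exercises the normal-approximation branch.
    for kappa in (0.5, 5.0, 80.0):
        for x in (-2.0, 0.3, 1.5):
            ours = von_mises_cdf(kappa, x)
            ref = scipy.stats.vonmises.cdf(x, kappa)
            assert abs(ours - ref) < 1e-6, (kappa, x, ours, ref)

If the wafo copy mirrors scipy's implementation, as these files suggest, the two results should agree to well below the tolerance used here.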