Updated from wafo.stats from scipy.stats
parent
5c84825641
commit
6b88f2d4cc
@ -1,12 +1,346 @@
|
||||
"""
|
||||
Statistics package in WAFO Toolbox.
|
||||
==========================================
|
||||
Statistical functions (:mod:`scipy.stats`)
|
||||
==========================================
|
||||
|
||||
Readme - Readme file for module STATS in WAFO Toolbox
|
||||
.. module:: scipy.stats
|
||||
|
||||
This module contains a large number of probability distributions as
|
||||
well as a growing library of statistical functions.
|
||||
|
||||
Each included distribution is an instance of the class rv_continous:
|
||||
For each given name the following methods are available:
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
rv_continuous
|
||||
rv_continuous.pdf
|
||||
rv_continuous.logpdf
|
||||
rv_continuous.cdf
|
||||
rv_continuous.logcdf
|
||||
rv_continuous.sf
|
||||
rv_continuous.logsf
|
||||
rv_continuous.ppf
|
||||
rv_continuous.isf
|
||||
rv_continuous.moment
|
||||
rv_continuous.stats
|
||||
rv_continuous.entropy
|
||||
rv_continuous.fit
|
||||
rv_continuous.expect
|
||||
|
||||
Calling the instance as a function returns a frozen pdf whose shape,
|
||||
location, and scale parameters are fixed.
|
||||
|
||||
Similarly, each discrete distribution is an instance of the class
|
||||
rv_discrete:
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
rv_discrete
|
||||
rv_discrete.rvs
|
||||
rv_discrete.pmf
|
||||
rv_discrete.logpmf
|
||||
rv_discrete.cdf
|
||||
rv_discrete.logcdf
|
||||
rv_discrete.sf
|
||||
rv_discrete.logsf
|
||||
rv_discrete.ppf
|
||||
rv_discrete.isf
|
||||
rv_discrete.stats
|
||||
rv_discrete.moment
|
||||
rv_discrete.entropy
|
||||
rv_discrete.expect
|
||||
|
||||
Continuous distributions
|
||||
========================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
alpha -- Alpha
|
||||
anglit -- Anglit
|
||||
arcsine -- Arcsine
|
||||
beta -- Beta
|
||||
betaprime -- Beta Prime
|
||||
bradford -- Bradford
|
||||
burr -- Burr
|
||||
cauchy -- Cauchy
|
||||
chi -- Chi
|
||||
chi2 -- Chi-squared
|
||||
cosine -- Cosine
|
||||
dgamma -- Double Gamma
|
||||
dweibull -- Double Weibull
|
||||
erlang -- Erlang
|
||||
expon -- Exponential
|
||||
exponweib -- Exponentiated Weibull
|
||||
exponpow -- Exponential Power
|
||||
f -- F (Snecdor F)
|
||||
fatiguelife -- Fatigue Life (Birnbaum-Sanders)
|
||||
fisk -- Fisk
|
||||
foldcauchy -- Folded Cauchy
|
||||
foldnorm -- Folded Normal
|
||||
frechet_r -- Frechet Right Sided, Extreme Value Type II (Extreme LB) or weibull_min
|
||||
frechet_l -- Frechet Left Sided, Weibull_max
|
||||
genlogistic -- Generalized Logistic
|
||||
genpareto -- Generalized Pareto
|
||||
genexpon -- Generalized Exponential
|
||||
genextreme -- Generalized Extreme Value
|
||||
gausshyper -- Gauss Hypergeometric
|
||||
gamma -- Gamma
|
||||
gengamma -- Generalized gamma
|
||||
genhalflogistic -- Generalized Half Logistic
|
||||
gilbrat -- Gilbrat
|
||||
gompertz -- Gompertz (Truncated Gumbel)
|
||||
gumbel_r -- Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
|
||||
gumbel_l -- Left Sided Gumbel, etc.
|
||||
halfcauchy -- Half Cauchy
|
||||
halflogistic -- Half Logistic
|
||||
halfnorm -- Half Normal
|
||||
hypsecant -- Hyperbolic Secant
|
||||
invgamma -- Inverse Gamma
|
||||
invgauss -- Inverse Gaussian
|
||||
invweibull -- Inverse Weibull
|
||||
johnsonsb -- Johnson SB
|
||||
johnsonsu -- Johnson SU
|
||||
ksone -- Kolmogorov-Smirnov one-sided (no stats)
|
||||
kstwobign -- Kolmogorov-Smirnov two-sided test for Large N (no stats)
|
||||
laplace -- Laplace
|
||||
logistic -- Logistic
|
||||
loggamma -- Log-Gamma
|
||||
loglaplace -- Log-Laplace (Log Double Exponential)
|
||||
lognorm -- Log-Normal
|
||||
lomax -- Lomax (Pareto of the second kind)
|
||||
maxwell -- Maxwell
|
||||
mielke -- Mielke's Beta-Kappa
|
||||
nakagami -- Nakagami
|
||||
ncx2 -- Non-central chi-squared
|
||||
ncf -- Non-central F
|
||||
nct -- Non-central Student's T
|
||||
norm -- Normal (Gaussian)
|
||||
pareto -- Pareto
|
||||
pearson3 -- Pearson type III
|
||||
powerlaw -- Power-function
|
||||
powerlognorm -- Power log normal
|
||||
powernorm -- Power normal
|
||||
rdist -- R-distribution
|
||||
reciprocal -- Reciprocal
|
||||
rayleigh -- Rayleigh
|
||||
rice -- Rice
|
||||
recipinvgauss -- Reciprocal Inverse Gaussian
|
||||
semicircular -- Semicircular
|
||||
t -- Student's T
|
||||
triang -- Triangular
|
||||
truncexpon -- Truncated Exponential
|
||||
truncnorm -- Truncated Normal
|
||||
tukeylambda -- Tukey-Lambda
|
||||
uniform -- Uniform
|
||||
vonmises -- Von-Mises (Circular)
|
||||
wald -- Wald
|
||||
weibull_min -- Minimum Weibull (see Frechet)
|
||||
weibull_max -- Maximum Weibull (see Frechet)
|
||||
wrapcauchy -- Wrapped Cauchy
|
||||
|
||||
Multivariate distributions
|
||||
==========================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
multivariate_normal -- Multivariate normal distribution
|
||||
|
||||
Discrete distributions
|
||||
======================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
bernoulli -- Bernoulli
|
||||
binom -- Binomial
|
||||
boltzmann -- Boltzmann (Truncated Discrete Exponential)
|
||||
dlaplace -- Discrete Laplacian
|
||||
geom -- Geometric
|
||||
hypergeom -- Hypergeometric
|
||||
logser -- Logarithmic (Log-Series, Series)
|
||||
nbinom -- Negative Binomial
|
||||
planck -- Planck (Discrete Exponential)
|
||||
poisson -- Poisson
|
||||
randint -- Discrete Uniform
|
||||
skellam -- Skellam
|
||||
zipf -- Zipf
|
||||
|
||||
Statistical functions
|
||||
=====================
|
||||
|
||||
Several of these functions have a similar version in scipy.stats.mstats
|
||||
which work for masked arrays.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
describe -- Descriptive statistics
|
||||
gmean -- Geometric mean
|
||||
hmean -- Harmonic mean
|
||||
kurtosis -- Fisher or Pearson kurtosis
|
||||
kurtosistest --
|
||||
mode -- Modal value
|
||||
moment -- Central moment
|
||||
normaltest --
|
||||
skew -- Skewness
|
||||
skewtest --
|
||||
tmean -- Truncated arithmetic mean
|
||||
tvar -- Truncated variance
|
||||
tmin --
|
||||
tmax --
|
||||
tstd --
|
||||
tsem --
|
||||
nanmean -- Mean, ignoring NaN values
|
||||
nanstd -- Standard deviation, ignoring NaN values
|
||||
nanmedian -- Median, ignoring NaN values
|
||||
variation -- Coefficient of variation
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
cumfreq _
|
||||
histogram2 _
|
||||
histogram _
|
||||
itemfreq _
|
||||
percentileofscore _
|
||||
scoreatpercentile _
|
||||
relfreq _
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
binned_statistic -- Compute a binned statistic for a set of data.
|
||||
binned_statistic_2d -- Compute a 2-D binned statistic for a set of data.
|
||||
binned_statistic_dd -- Compute a d-D binned statistic for a set of data.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
obrientransform
|
||||
signaltonoise
|
||||
bayes_mvs
|
||||
sem
|
||||
zmap
|
||||
zscore
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
threshold
|
||||
trimboth
|
||||
trim1
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
f_oneway
|
||||
pearsonr
|
||||
spearmanr
|
||||
pointbiserialr
|
||||
kendalltau
|
||||
linregress
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ttest_1samp
|
||||
ttest_ind
|
||||
ttest_rel
|
||||
kstest
|
||||
chisquare
|
||||
power_divergence
|
||||
ks_2samp
|
||||
mannwhitneyu
|
||||
tiecorrect
|
||||
rankdata
|
||||
ranksums
|
||||
wilcoxon
|
||||
kruskal
|
||||
friedmanchisquare
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ansari
|
||||
bartlett
|
||||
levene
|
||||
shapiro
|
||||
anderson
|
||||
binom_test
|
||||
fligner
|
||||
mood
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
boxcox
|
||||
boxcox_normmax
|
||||
boxcox_llf
|
||||
|
||||
Contingency table functions
|
||||
===========================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
chi2_contingency
|
||||
contingency.expected_freq
|
||||
contingency.margins
|
||||
fisher_exact
|
||||
|
||||
Plot-tests
|
||||
==========
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ppcc_max
|
||||
ppcc_plot
|
||||
probplot
|
||||
boxcox_normplot
|
||||
|
||||
|
||||
Masked statistics functions
|
||||
===========================
|
||||
|
||||
.. toctree::
|
||||
|
||||
stats.mstats
|
||||
|
||||
|
||||
Univariate and multivariate kernel density estimation (:mod:`scipy.stats.kde`)
|
||||
==============================================================================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
gaussian_kde
|
||||
|
||||
For many more stat related functions install the software R and the
|
||||
interface package rpy.
|
||||
|
||||
"""
|
||||
from __future__ import division, print_function, absolute_import
|
||||
from scipy.stats import *
|
||||
from core import *
|
||||
import distributions #@Reimport
|
||||
from wafo.stats.distributions import *
|
||||
from .core import *
|
||||
from .stats import *
|
||||
from .distributions import *
|
||||
from .rv import *
|
||||
from .morestats import *
|
||||
from ._binned_statistic import *
|
||||
from .kde import gaussian_kde
|
||||
from . import mstats
|
||||
from .contingency import chi2_contingency
|
||||
from ._multivariate import *
|
||||
from . import estimation
|
||||
|
||||
#remove vonmises_cython from __all__, I don't know why it is included
|
||||
__all__ = [s for s in dir() if not (s.startswith('_') or s.endswith('cython'))]
|
||||
#import distributions #@Reimport
|
||||
#from wafo.stats.distributions import *
|
||||
|
||||
import estimation
|
@ -0,0 +1,762 @@
|
||||
#
|
||||
# Author: Travis Oliphant 2002-2011 with contributions from
|
||||
# SciPy Developers 2004-2011
|
||||
#
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from scipy import special
|
||||
from scipy.special import gammaln as gamln
|
||||
|
||||
from numpy import floor, ceil, log, exp, sqrt, log1p, expm1, tanh, cosh, sinh
|
||||
|
||||
import numpy as np
|
||||
import numpy.random as mtrand
|
||||
|
||||
from ._distn_infrastructure import (
|
||||
rv_discrete, _lazywhere, _ncx2_pdf, _ncx2_cdf)
|
||||
|
||||
__all__ = [
|
||||
'binom', 'bernoulli', 'nbinom', 'geom', 'hypergeom',
|
||||
'logser', 'poisson', 'planck', 'boltzmann', 'randint',
|
||||
'zipf', 'dlaplace', 'skellam'
|
||||
]
|
||||
|
||||
|
||||
class binom_gen(rv_discrete):
|
||||
"""A binomial discrete random variable.
|
||||
|
||||
%(before_notes)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
The probability mass function for `binom` is::
|
||||
|
||||
binom.pmf(k) = choose(n, k) * p**k * (1-p)**(n-k)
|
||||
|
||||
for ``k`` in ``{0, 1,..., n}``.
|
||||
|
||||
`binom` takes ``n`` and ``p`` as shape parameters.
|
||||
|
||||
%(example)s
|
||||
|
||||
"""
|
||||
def _rvs(self, n, p):
|
||||
return mtrand.binomial(n, p, self._size)
|
||||
|
||||
def _argcheck(self, n, p):
|
||||
self.b = n
|
||||
return (n >= 0) & (p >= 0) & (p <= 1)
|
||||
|
||||
def _logpmf(self, x, n, p):
|
||||
k = floor(x)
|
||||
combiln = (gamln(n+1) - (gamln(k+1) + gamln(n-k+1)))
|
||||
return combiln + special.xlogy(k, p) + special.xlog1py(n-k, -p)
|
||||
|
||||
def _pmf(self, x, n, p):
|
||||
return exp(self._logpmf(x, n, p))
|
||||
|
||||
def _cdf(self, x, n, p):
|
||||
k = floor(x)
|
||||
vals = special.bdtr(k, n, p)
|
||||
return vals
|
||||
|
||||
def _sf(self, x, n, p):
|
||||
k = floor(x)
|
||||
return special.bdtrc(k, n, p)
|
||||
|
||||
def _ppf(self, q, n, p):
|
||||
vals = ceil(special.bdtrik(q, n, p))
|
||||
vals1 = vals-1
|
||||
temp = special.bdtr(vals1, n, p)
|
||||
return np.where(temp >= q, vals1, vals)
|
||||
|
||||
def _stats(self, n, p):
|
||||
q = 1.0-p
|
||||
mu = n * p
|
||||
var = n * p * q
|
||||
g1 = (q-p) / sqrt(n*p*q)
|
||||
g2 = (1.0-6*p*q)/(n*p*q)
|
||||
return mu, var, g1, g2
|
||||
|
||||
def _entropy(self, n, p):
|
||||
k = np.r_[0:n + 1]
|
||||
vals = self._pmf(k, n, p)
|
||||
h = -np.sum(special.xlogy(vals, vals), axis=0)
|
||||
return h
|
||||
binom = binom_gen(name='binom')
|
||||
|
||||
|
||||
class bernoulli_gen(binom_gen):
|
||||
"""A Bernoulli discrete random variable.
|
||||
|
||||
%(before_notes)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
The probability mass function for `bernoulli` is::
|
||||
|
||||
bernoulli.pmf(k) = 1-p if k = 0
|
||||
= p if k = 1
|
||||
|
||||
for ``k`` in ``{0, 1}``.
|
||||
|
||||
`bernoulli` takes ``p`` as shape parameter.
|
||||
|
||||
%(example)s
|
||||
|
||||
"""
|
||||
def _rvs(self, p):
|
||||
return binom_gen._rvs(self, 1, p)
|
||||
|
||||
def _argcheck(self, p):
|
||||
return (p >= 0) & (p <= 1)
|
||||
|
||||
def _logpmf(self, x, p):
|
||||
return binom._logpmf(x, 1, p)
|
||||
|
||||
def _pmf(self, x, p):
|
||||
return binom._pmf(x, 1, p)
|
||||
|
||||
def _cdf(self, x, p):
|
||||
return binom._cdf(x, 1, p)
|
||||
|
||||
def _sf(self, x, p):
|
||||
return binom._sf(x, 1, p)
|
||||
|
||||
def _ppf(self, q, p):
|
||||
return binom._ppf(q, 1, p)
|
||||
|
||||
def _stats(self, p):
|
||||
return binom._stats(1, p)
|
||||
|
||||
def _entropy(self, p):
|
||||
h = -special.xlogy(p, p) - special.xlogy(1 - p, 1 - p)
|
||||
return h
|
||||
bernoulli = bernoulli_gen(b=1, name='bernoulli')
|
||||
|
||||
|
||||
class nbinom_gen(rv_discrete):
|
||||
"""A negative binomial discrete random variable.
|
||||
|
||||
%(before_notes)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
The probability mass function for `nbinom` is::
|
||||
|
||||
nbinom.pmf(k) = choose(k+n-1, n-1) * p**n * (1-p)**k
|
||||
|
||||
for ``k >= 0``.
|
||||
|
||||
`nbinom` takes ``n`` and ``p`` as shape parameters.
|
||||
|
||||
%(example)s
|
||||
|
||||
"""
|
||||
def _rvs(self, n, p):
|
||||
return mtrand.negative_binomial(n, p, self._size)
|
||||
|
||||
def _argcheck(self, n, p):
|
||||
return (n >= 0) & (p >= 0) & (p <= 1)
|
||||
|
||||
def _pmf(self, x, n, p):
|
||||
return exp(self._logpmf(x, n, p))
|
||||
|
||||
def _logpmf(self, x, n, p):
|
||||
coeff = gamln(n+x) - gamln(x+1) - gamln(n)
|
||||
return coeff + n*log(p) + x*log1p(-p)
|
||||
|
||||
def _cdf(self, x, n, p):
|
||||
k = floor(x)
|
||||
return special.betainc(n, k+1, p)
|
||||
|
||||
def _sf_skip(self, x, n, p):
|
||||
# skip because special.nbdtrc doesn't work for 0<n<1
|
||||
k = floor(x)
|
||||
return special.nbdtrc(k, n, p)
|
||||
|
||||
def _ppf(self, q, n, p):
|
||||
vals = ceil(special.nbdtrik(q, n, p))
|
||||
vals1 = (vals-1).clip(0.0, np.inf)
|
||||
temp = self._cdf(vals1, n, p)
|
||||
return np.where(temp >= q, vals1, vals)
|
||||
|
||||
def _stats(self, n, p):
|
||||
Q = 1.0 / p
|
||||
P = Q - 1.0
|
||||
mu = n*P
|
||||
var = n*P*Q
|
||||
g1 = (Q+P)/sqrt(n*P*Q)
|
||||
g2 = (1.0 + 6*P*Q) / (n*P*Q)
|
||||
return mu, var, g1, g2
|
||||
nbinom = nbinom_gen(name='nbinom')
|
||||
|
||||
|
||||
class geom_gen(rv_discrete):
|
||||
"""A geometric discrete random variable.
|
||||
|
||||
%(before_notes)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
The probability mass function for `geom` is::
|
||||
|
||||
geom.pmf(k) = (1-p)**(k-1)*p
|
||||
|
||||
for ``k >= 1``.
|
||||
|
||||
`geom` takes ``p`` as shape parameter.
|
||||
|
||||
%(example)s
|
||||
|
||||
"""
|
||||
def _rvs(self, p):
|
||||
return mtrand.geometric(p, size=self._size)
|
||||
|
||||
def _argcheck(self, p):
|
||||
return (p <= 1) & (p >= 0)
|
||||
|
||||
def _pmf(self, k, p):
|
||||
return np.power(1-p, k-1) * p
|
||||
|
||||
def _logpmf(self, k, p):
|
||||
return (k-1)*log1p(-p) + log(p)
|
||||
|
||||
def _cdf(self, x, p):
|
||||
k = floor(x)
|
||||
return -expm1(log1p(-p)*k)
|
||||
|
||||
def _sf(self, x, p):
|
||||
return np.exp(self._logsf(x, p))
|
||||
|
||||
def _logsf(self, x, p):
|
||||
k = floor(x)
|
||||
return k*log1p(-p)
|
||||
|
||||
def _ppf(self, q, p):
|
||||
vals = ceil(log1p(-q)/log1p(-p))
|
||||
temp = self._cdf(vals-1, p)
|
||||
return np.where((temp >= q) & (vals > 0), vals-1, vals)
|
||||
|
||||
def _stats(self, p):
|
||||
mu = 1.0/p
|
||||
qr = 1.0-p
|
||||
var = qr / p / p
|
||||
g1 = (2.0-p) / sqrt(qr)
|
||||
g2 = np.polyval([1, -6, 6], p)/(1.0-p)
|
||||
return mu, var, g1, g2
|
||||
geom = geom_gen(a=1, name='geom', longname="A geometric")
|
||||
|
||||
|
||||
class hypergeom_gen(rv_discrete):
|
||||
"""A hypergeometric discrete random variable.
|
||||
|
||||
The hypergeometric distribution models drawing objects from a bin.
|
||||
M is the total number of objects, n is total number of Type I objects.
|
||||
The random variate represents the number of Type I objects in N drawn
|
||||
without replacement from the total population.
|
||||
|
||||
%(before_notes)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
The probability mass function is defined as::
|
||||
|
||||
pmf(k, M, n, N) = choose(n, k) * choose(M - n, N - k) / choose(M, N),
|
||||
for max(0, N - (M-n)) <= k <= min(n, N)
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.stats import hypergeom
|
||||
|
||||
Suppose we have a collection of 20 animals, of which 7 are dogs. Then if
|
||||
we want to know the probability of finding a given number of dogs if we
|
||||
choose at random 12 of the 20 animals, we can initialize a frozen
|
||||
distribution and plot the probability mass function:
|
||||
|
||||
>>> [M, n, N] = [20, 7, 12]
|
||||
>>> rv = hypergeom(M, n, N)
|
||||
>>> x = np.arange(0, n+1)
|
||||
>>> pmf_dogs = rv.pmf(x)
|
||||
|
||||
>>> fig = plt.figure()
|
||||
>>> ax = fig.add_subplot(111)
|
||||
>>> ax.plot(x, pmf_dogs, 'bo')
|
||||
>>> ax.vlines(x, 0, pmf_dogs, lw=2)
|
||||
>>> ax.set_xlabel('# of dogs in our group of chosen animals')
|
||||
>>> ax.set_ylabel('hypergeom PMF')
|
||||
>>> plt.show()
|
||||
|
||||
Instead of using a frozen distribution we can also use `hypergeom`
|
||||
methods directly. To for example obtain the cumulative distribution
|
||||
function, use:
|
||||
|
||||
>>> prb = hypergeom.cdf(x, M, n, N)
|
||||
|
||||
And to generate random numbers:
|
||||
|
||||
>>> R = hypergeom.rvs(M, n, N, size=10)
|
||||
|
||||
"""
|
||||
def _rvs(self, M, n, N):
|
||||
return mtrand.hypergeometric(n, M-n, N, size=self._size)
|
||||
|
||||
def _argcheck(self, M, n, N):
|
||||
cond = rv_discrete._argcheck(self, M, n, N)
|
||||
cond &= (n <= M) & (N <= M)
|
||||
self.a = max(N-(M-n), 0)
|
||||
self.b = min(n, N)
|
||||
return cond
|
||||
|
||||
def _logpmf(self, k, M, n, N):
|
||||
tot, good = M, n
|
||||
bad = tot - good
|
||||
return gamln(good+1) - gamln(good-k+1) - gamln(k+1) + gamln(bad+1) \
|
||||
- gamln(bad-N+k+1) - gamln(N-k+1) - gamln(tot+1) + gamln(tot-N+1) \
|
||||
+ gamln(N+1)
|
||||
|
||||
def _pmf(self, k, M, n, N):
|
||||
# same as the following but numerically more precise
|
||||
# return comb(good, k) * comb(bad, N-k) / comb(tot, N)
|
||||
return exp(self._logpmf(k, M, n, N))
|
||||
|
||||
def _stats(self, M, n, N):
|
||||
# tot, good, sample_size = M, n, N
|
||||
# "wikipedia".replace('N', 'M').replace('n', 'N').replace('K', 'n')
|
||||
M, n, N = 1.*M, 1.*n, 1.*N
|
||||
m = M - n
|
||||
p = n/M
|
||||
mu = N*p
|
||||
|
||||
var = m*n*N*(M - N)*1.0/(M*M*(M-1))
|
||||
g1 = (m - n)*(M-2*N) / (M-2.0) * sqrt((M-1.0) / (m*n*N*(M-N)))
|
||||
|
||||
g2 = M*(M+1) - 6.*N*(M-N) - 6.*n*m
|
||||
g2 *= (M-1)*M*M
|
||||
g2 += 6.*n*N*(M-N)*m*(5.*M-6)
|
||||
g2 /= n * N * (M-N) * m * (M-2.) * (M-3.)
|
||||
return mu, var, g1, g2
|
||||
|
||||
def _entropy(self, M, n, N):
|
||||
k = np.r_[N - (M - n):min(n, N) + 1]
|
||||
vals = self.pmf(k, M, n, N)
|
||||
h = -np.sum(special.xlogy(vals, vals), axis=0)
|
||||
return h
|
||||
|
||||
def _sf(self, k, M, n, N):
|
||||
"""More precise calculation, 1 - cdf doesn't cut it."""
|
||||
# This for loop is needed because `k` can be an array. If that's the
|
||||
# case, the sf() method makes M, n and N arrays of the same shape. We
|
||||
# therefore unpack all inputs args, so we can do the manual
|
||||
# integration.
|
||||
res = []
|
||||
for quant, tot, good, draw in zip(k, M, n, N):
|
||||
# Manual integration over probability mass function. More accurate
|
||||
# than integrate.quad.
|
||||
k2 = np.arange(quant + 1, draw + 1)
|
||||
res.append(np.sum(self._pmf(k2, tot, good, draw)))
|
||||
return np.asarray(res)
|
||||
hypergeom = hypergeom_gen(name='hypergeom')
|
||||
|
||||
|
||||
# FIXME: Fails _cdfvec
|
||||
class logser_gen(rv_discrete):
|
||||
"""A Logarithmic (Log-Series, Series) discrete random variable.
|
||||
|
||||
%(before_notes)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
The probability mass function for `logser` is::
|
||||
|
||||
logser.pmf(k) = - p**k / (k*log(1-p))
|
||||
|
||||
for ``k >= 1``.
|
||||
|
||||
`logser` takes ``p`` as shape parameter.
|
||||
|
||||
%(example)s
|
||||
|
||||
"""
|
||||
def _rvs(self, p):
|
||||
# looks wrong for p>0.5, too few k=1
|
||||
# trying to use generic is worse, no k=1 at all
|
||||
return mtrand.logseries(p, size=self._size)
|
||||
|
||||
def _argcheck(self, p):
|
||||
return (p > 0) & (p < 1)
|
||||
|
||||
def _pmf(self, k, p):
|
||||
return -np.power(p, k) * 1.0 / k / log1p(- p)
|
||||
|
||||
def _stats(self, p):
|
||||
r = log1p(-p)
|
||||
mu = p / (p - 1.0) / r
|
||||
mu2p = -p / r / (p - 1.0)**2
|
||||
var = mu2p - mu*mu
|
||||
mu3p = -p / r * (1.0+p) / (1.0 - p)**3
|
||||
mu3 = mu3p - 3*mu*mu2p + 2*mu**3
|
||||
g1 = mu3 / np.power(var, 1.5)
|
||||
|
||||
mu4p = -p / r * (
|
||||
1.0 / (p-1)**2 - 6*p / (p - 1)**3 + 6*p*p / (p-1)**4)
|
||||
mu4 = mu4p - 4*mu3p*mu + 6*mu2p*mu*mu - 3*mu**4
|
||||
g2 = mu4 / var**2 - 3.0
|
||||
return mu, var, g1, g2
|
||||
logser = logser_gen(a=1, name='logser', longname='A logarithmic')
|
||||
|
||||
|
||||
class poisson_gen(rv_discrete):
|
||||
"""A Poisson discrete random variable.
|
||||
|
||||
%(before_notes)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
The probability mass function for `poisson` is::
|
||||
|
||||
poisson.pmf(k) = exp(-mu) * mu**k / k!
|
||||
|
||||
for ``k >= 0``.
|
||||
|
||||
`poisson` takes ``mu`` as shape parameter.
|
||||
|
||||
%(example)s
|
||||
|
||||
"""
|
||||
def _rvs(self, mu):
|
||||
return mtrand.poisson(mu, self._size)
|
||||
|
||||
def _logpmf(self, k, mu):
|
||||
Pk = k*log(mu)-gamln(k+1) - mu
|
||||
return Pk
|
||||
|
||||
def _pmf(self, k, mu):
|
||||
return exp(self._logpmf(k, mu))
|
||||
|
||||
def _cdf(self, x, mu):
|
||||
k = floor(x)
|
||||
return special.pdtr(k, mu)
|
||||
|
||||
def _sf(self, x, mu):
|
||||
k = floor(x)
|
||||
return special.pdtrc(k, mu)
|
||||
|
||||
def _ppf(self, q, mu):
|
||||
vals = ceil(special.pdtrik(q, mu))
|
||||
vals1 = vals - 1
|
||||
temp = special.pdtr(vals1, mu)
|
||||
return np.where((temp >= q), vals1, vals)
|
||||
|
||||
def _stats(self, mu):
|
||||
var = mu
|
||||
tmp = np.asarray(mu)
|
||||
g1 = sqrt(1.0 / tmp)
|
||||
g2 = 1.0 / tmp
|
||||
return mu, var, g1, g2
|
||||
poisson = poisson_gen(name="poisson", longname='A Poisson')
|
||||
|
||||
|
||||
class planck_gen(rv_discrete):
|
||||
"""A Planck discrete exponential random variable.
|
||||
|
||||
%(before_notes)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
The probability mass function for `planck` is::
|
||||
|
||||
planck.pmf(k) = (1-exp(-lambda_))*exp(-lambda_*k)
|
||||
|
||||
for ``k*lambda_ >= 0``.
|
||||
|
||||
`planck` takes ``lambda_`` as shape parameter.
|
||||
|
||||
%(example)s
|
||||
|
||||
"""
|
||||
def _argcheck(self, lambda_):
|
||||
if (lambda_ > 0):
|
||||
self.a = 0
|
||||
self.b = np.inf
|
||||
return 1
|
||||
elif (lambda_ < 0):
|
||||
self.a = -np.inf
|
||||
self.b = 0
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
def _pmf(self, k, lambda_):
|
||||
fact = -expm1(-lambda_)
|
||||
return fact * exp(-lambda_ * k)
|
||||
|
||||
def _cdf(self, x, lambda_):
|
||||
k = floor(x)
|
||||
return - expm1(-lambda_ * (k + 1))
|
||||
|
||||
def _ppf(self, q, lambda_):
|
||||
vals = ceil(-1.0/lambda_ * log1p(-q)-1)
|
||||
vals1 = (vals-1).clip(self.a, np.inf)
|
||||
temp = self._cdf(vals1, lambda_)
|
||||
return np.where(temp >= q, vals1, vals)
|
||||
|
||||
def _stats(self, lambda_):
|
||||
mu = 1/(exp(lambda_)-1)
|
||||
var = exp(-lambda_)/(expm1(-lambda_))**2
|
||||
g1 = 2*cosh(lambda_/2.0)
|
||||
g2 = 4+2*cosh(lambda_)
|
||||
return mu, var, g1, g2
|
||||
|
||||
def _entropy(self, lambda_):
|
||||
l = lambda_
|
||||
C = -expm1(-l)
|
||||
return l * exp(-l) / C - log(C)
|
||||
planck = planck_gen(name='planck', longname='A discrete exponential ')
|
||||
|
||||
|
||||
class boltzmann_gen(rv_discrete):
|
||||
"""A Boltzmann (Truncated Discrete Exponential) random variable.
|
||||
|
||||
%(before_notes)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
The probability mass function for `boltzmann` is::
|
||||
|
||||
boltzmann.pmf(k) = (1-exp(-lambda_)*exp(-lambda_*k)/(1-exp(-lambda_*N))
|
||||
|
||||
for ``k = 0,..., N-1``.
|
||||
|
||||
`boltzmann` takes ``lambda_`` and ``N`` as shape parameters.
|
||||
|
||||
%(example)s
|
||||
|
||||
"""
|
||||
def _pmf(self, k, lambda_, N):
|
||||
fact = (expm1(-lambda_))/(expm1(-lambda_*N))
|
||||
return fact*exp(-lambda_*k)
|
||||
|
||||
def _cdf(self, x, lambda_, N):
|
||||
k = floor(x)
|
||||
return (expm1(-lambda_*(k+1)))/(expm1(-lambda_*N))
|
||||
|
||||
def _ppf(self, q, lambda_, N):
|
||||
qnew = -q*(expm1(-lambda_*N))
|
||||
vals = ceil(-1.0/lambda_ * log1p(-qnew)-1)
|
||||
vals1 = (vals-1).clip(0.0, np.inf)
|
||||
temp = self._cdf(vals1, lambda_, N)
|
||||
return np.where(temp >= q, vals1, vals)
|
||||
|
||||
def _stats(self, lambda_, N):
|
||||
z = exp(-lambda_)
|
||||
zN = exp(-lambda_*N)
|
||||
mu = z/(1.0-z)-N*zN/(1-zN)
|
||||
var = z/(1.0-z)**2 - N*N*zN/(1-zN)**2
|
||||
trm = (1-zN)/(1-z)
|
||||
trm2 = (z*trm**2 - N*N*zN)
|
||||
g1 = z*(1+z)*trm**3 - N**3*zN*(1+zN)
|
||||
g1 = g1 / trm2**(1.5)
|
||||
g2 = z*(1+4*z+z*z)*trm**4 - N**4 * zN*(1+4*zN+zN*zN)
|
||||
g2 = g2 / trm2 / trm2
|
||||
return mu, var, g1, g2
|
||||
boltzmann = boltzmann_gen(name='boltzmann',
|
||||
longname='A truncated discrete exponential ')
|
||||
|
||||
|
||||
class randint_gen(rv_discrete):
|
||||
"""A uniform discrete random variable.
|
||||
|
||||
%(before_notes)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
The probability mass function for `randint` is::
|
||||
|
||||
randint.pmf(k) = 1./(high - low)
|
||||
|
||||
for ``k = low, ..., high - 1``.
|
||||
|
||||
`randint` takes ``low`` and ``high`` as shape parameters.
|
||||
|
||||
Note the difference to the numpy ``random_integers`` which
|
||||
returns integers on a *closed* interval ``[low, high]``.
|
||||
|
||||
%(example)s
|
||||
|
||||
"""
|
||||
def _argcheck(self, low, high):
|
||||
self.a = low
|
||||
self.b = high - 1
|
||||
return (high > low)
|
||||
|
||||
def _pmf(self, k, low, high):
|
||||
p = np.ones_like(k) / (high - low)
|
||||
return np.where((k >= low) & (k < high), p, 0.)
|
||||
|
||||
def _cdf(self, x, low, high):
|
||||
k = floor(x)
|
||||
return (k - low + 1.) / (high - low)
|
||||
|
||||
def _ppf(self, q, low, high):
|
||||
vals = ceil(q * (high - low) + low) - 1
|
||||
vals1 = (vals - 1).clip(low, high)
|
||||
temp = self._cdf(vals1, low, high)
|
||||
return np.where(temp >= q, vals1, vals)
|
||||
|
||||
def _stats(self, low, high):
|
||||
m2, m1 = np.asarray(high), np.asarray(low)
|
||||
mu = (m2 + m1 - 1.0) / 2
|
||||
d = m2 - m1
|
||||
var = (d*d - 1) / 12.0
|
||||
g1 = 0.0
|
||||
g2 = -6.0/5.0 * (d*d + 1.0) / (d*d - 1.0)
|
||||
return mu, var, g1, g2
|
||||
|
||||
def _rvs(self, low, high=None):
|
||||
"""An array of *size* random integers >= ``low`` and < ``high``.
|
||||
|
||||
If ``high`` is ``None``, then range is >=0 and < low
|
||||
"""
|
||||
return mtrand.randint(low, high, self._size)
|
||||
|
||||
def _entropy(self, low, high):
|
||||
return log(high - low)
|
||||
randint = randint_gen(name='randint', longname='A discrete uniform '
|
||||
'(random integer)')
|
||||
|
||||
|
||||
# FIXME: problems sampling.
|
||||
class zipf_gen(rv_discrete):
|
||||
"""A Zipf discrete random variable.
|
||||
|
||||
%(before_notes)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
The probability mass function for `zipf` is::
|
||||
|
||||
zipf.pmf(k, a) = 1/(zeta(a) * k**a)
|
||||
|
||||
for ``k >= 1``.
|
||||
|
||||
`zipf` takes ``a`` as shape parameter.
|
||||
|
||||
%(example)s
|
||||
|
||||
"""
|
||||
def _rvs(self, a):
|
||||
return mtrand.zipf(a, size=self._size)
|
||||
|
||||
def _argcheck(self, a):
|
||||
return a > 1
|
||||
|
||||
def _pmf(self, k, a):
|
||||
Pk = 1.0 / special.zeta(a, 1) / k**a
|
||||
return Pk
|
||||
|
||||
def _munp(self, n, a):
|
||||
return _lazywhere(
|
||||
a > n + 1, (a, n),
|
||||
lambda a, n: special.zeta(a - n, 1) / special.zeta(a, 1),
|
||||
np.inf)
|
||||
zipf = zipf_gen(a=1, name='zipf', longname='A Zipf')
|
||||
|
||||
|
||||
class dlaplace_gen(rv_discrete):
|
||||
"""A Laplacian discrete random variable.
|
||||
|
||||
%(before_notes)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
The probability mass function for `dlaplace` is::
|
||||
|
||||
dlaplace.pmf(k) = tanh(a/2) * exp(-a*abs(k))
|
||||
|
||||
for ``a > 0``.
|
||||
|
||||
`dlaplace` takes ``a`` as shape parameter.
|
||||
|
||||
%(example)s
|
||||
|
||||
"""
|
||||
def _pmf(self, k, a):
|
||||
return tanh(a/2.0) * exp(-a * abs(k))
|
||||
|
||||
def _cdf(self, x, a):
|
||||
k = floor(x)
|
||||
f = lambda k, a: 1.0 - exp(-a * k) / (exp(a) + 1)
|
||||
f2 = lambda k, a: exp(a * (k+1)) / (exp(a) + 1)
|
||||
return _lazywhere(k >= 0, (k, a), f=f, f2=f2)
|
||||
|
||||
def _ppf(self, q, a):
|
||||
const = 1 + exp(a)
|
||||
vals = ceil(np.where(q < 1.0 / (1 + exp(-a)), log(q*const) / a - 1,
|
||||
-log((1-q) * const) / a))
|
||||
vals1 = vals - 1
|
||||
return np.where(self._cdf(vals1, a) >= q, vals1, vals)
|
||||
|
||||
def _stats(self, a):
|
||||
ea = exp(a)
|
||||
mu2 = 2.*ea/(ea-1.)**2
|
||||
mu4 = 2.*ea*(ea**2+10.*ea+1.) / (ea-1.)**4
|
||||
return 0., mu2, 0., mu4/mu2**2 - 3.
|
||||
|
||||
def _entropy(self, a):
|
||||
return a / sinh(a) - log(tanh(a/2.0))
|
||||
dlaplace = dlaplace_gen(a=-np.inf,
|
||||
name='dlaplace', longname='A discrete Laplacian')
|
||||
|
||||
|
||||
class skellam_gen(rv_discrete):
|
||||
"""A Skellam discrete random variable.
|
||||
|
||||
%(before_notes)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
Probability distribution of the difference of two correlated or
|
||||
uncorrelated Poisson random variables.
|
||||
|
||||
Let k1 and k2 be two Poisson-distributed r.v. with expected values
|
||||
lam1 and lam2. Then, ``k1 - k2`` follows a Skellam distribution with
|
||||
parameters ``mu1 = lam1 - rho*sqrt(lam1*lam2)`` and
|
||||
``mu2 = lam2 - rho*sqrt(lam1*lam2)``, where rho is the correlation
|
||||
coefficient between k1 and k2. If the two Poisson-distributed r.v.
|
||||
are independent then ``rho = 0``.
|
||||
|
||||
Parameters mu1 and mu2 must be strictly positive.
|
||||
|
||||
For details see: http://en.wikipedia.org/wiki/Skellam_distribution
|
||||
|
||||
`skellam` takes ``mu1`` and ``mu2`` as shape parameters.
|
||||
|
||||
%(example)s
|
||||
|
||||
"""
|
||||
def _rvs(self, mu1, mu2):
|
||||
n = self._size
|
||||
return mtrand.poisson(mu1, n) - mtrand.poisson(mu2, n)
|
||||
|
||||
def _pmf(self, x, mu1, mu2):
|
||||
px = np.where(x < 0,
|
||||
_ncx2_pdf(2*mu2, 2*(1-x), 2*mu1)*2,
|
||||
_ncx2_pdf(2*mu1, 2*(1+x), 2*mu2)*2)
|
||||
# ncx2.pdf() returns nan's for extremely low probabilities
|
||||
return px
|
||||
|
||||
def _cdf(self, x, mu1, mu2):
|
||||
x = floor(x)
|
||||
px = np.where(x < 0,
|
||||
_ncx2_cdf(2*mu2, -2*x, 2*mu1),
|
||||
1-_ncx2_cdf(2*mu1, 2*(x+1), 2*mu2))
|
||||
return px
|
||||
|
||||
def _stats(self, mu1, mu2):
|
||||
mean = mu1 - mu2
|
||||
var = mu1 + mu2
|
||||
g1 = mean / sqrt((var)**3)
|
||||
g2 = 1 / var
|
||||
return mean, var, g1, g2
|
||||
skellam = skellam_gen(a=-np.inf, name="skellam", longname='A Skellam')
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,493 @@
|
||||
#
|
||||
# Author: Joris Vankerschaver 2013
|
||||
#
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from scipy.misc import doccer
|
||||
from functools import wraps
|
||||
import numpy as np
|
||||
import scipy.linalg
|
||||
|
||||
__all__ = ['multivariate_normal']
|
||||
|
||||
|
||||
_LOG_2PI = np.log(2 * np.pi)
|
||||
|
||||
|
||||
def _process_parameters(dim, mean, cov):
|
||||
"""
|
||||
Infer dimensionality from mean or covariance matrix, ensure that
|
||||
mean and covariance are full vector resp. matrix.
|
||||
|
||||
"""
|
||||
|
||||
# Try to infer dimensionality
|
||||
if dim is None:
|
||||
if mean is None:
|
||||
if cov is None:
|
||||
dim = 1
|
||||
else:
|
||||
cov = np.asarray(cov, dtype=float)
|
||||
if cov.ndim < 2:
|
||||
dim = 1
|
||||
else:
|
||||
dim = cov.shape[0]
|
||||
else:
|
||||
mean = np.asarray(mean, dtype=float)
|
||||
dim = mean.size
|
||||
else:
|
||||
if not np.isscalar(dim):
|
||||
raise ValueError("Dimension of random variable must be a scalar.")
|
||||
|
||||
# Check input sizes and return full arrays for mean and cov if necessary
|
||||
if mean is None:
|
||||
mean = np.zeros(dim)
|
||||
mean = np.asarray(mean, dtype=float)
|
||||
|
||||
if cov is None:
|
||||
cov = 1.0
|
||||
cov = np.asarray(cov, dtype=float)
|
||||
|
||||
if dim == 1:
|
||||
mean.shape = (1,)
|
||||
cov.shape = (1, 1)
|
||||
|
||||
if mean.ndim != 1 or mean.shape[0] != dim:
|
||||
raise ValueError("Array 'mean' must be vector of length %d." % dim)
|
||||
if cov.ndim == 0:
|
||||
cov = cov * np.eye(dim)
|
||||
elif cov.ndim == 1:
|
||||
cov = np.diag(cov)
|
||||
else:
|
||||
if cov.shape != (dim, dim):
|
||||
raise ValueError("Array 'cov' must be at most two-dimensional,"
|
||||
" but cov.ndim = %d" % cov.ndim)
|
||||
|
||||
return dim, mean, cov
|
||||
|
||||
|
||||
def _process_quantiles(x, dim):
|
||||
"""
|
||||
Adjust quantiles array so that last axis labels the components of
|
||||
each data point.
|
||||
|
||||
"""
|
||||
x = np.asarray(x, dtype=float)
|
||||
|
||||
if x.ndim == 0:
|
||||
x = x[np.newaxis]
|
||||
elif x.ndim == 1:
|
||||
if dim == 1:
|
||||
x = x[:, np.newaxis]
|
||||
else:
|
||||
x = x[np.newaxis, :]
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def _squeeze_output(out):
|
||||
"""
|
||||
Remove single-dimensional entries from array and convert to scalar,
|
||||
if necessary.
|
||||
|
||||
"""
|
||||
out = out.squeeze()
|
||||
if out.ndim == 0:
|
||||
out = out[()]
|
||||
return out
|
||||
|
||||
|
||||
def _pinv_1d(v, eps=1e-5):
|
||||
"""
|
||||
A helper function for computing the pseudoinverse.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
v : iterable of numbers
|
||||
This may be thought of as a vector of eigenvalues or singular values.
|
||||
eps : float
|
||||
Elements of v smaller than eps are considered negligible.
|
||||
|
||||
Returns
|
||||
-------
|
||||
v_pinv : 1d float ndarray
|
||||
A vector of pseudo-inverted numbers.
|
||||
|
||||
"""
|
||||
return np.array([0 if abs(x) < eps else 1/x for x in v], dtype=float)
|
||||
|
||||
|
||||
def _psd_pinv_decomposed_log_pdet(mat, cond=None, rcond=None,
|
||||
lower=True, check_finite=True):
|
||||
"""
|
||||
Compute a decomposition of the pseudo-inverse and the logarithm of
|
||||
the pseudo-determinant of a symmetric positive semi-definite
|
||||
matrix.
|
||||
|
||||
The pseudo-determinant of a matrix is defined as the product of
|
||||
the non-zero eigenvalues, and coincides with the usual determinant
|
||||
for a full matrix.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mat : array_like
|
||||
Input array of shape (`m`, `n`)
|
||||
cond, rcond : float or None
|
||||
Cutoff for 'small' singular values.
|
||||
Eigenvalues smaller than ``rcond*largest_eigenvalue``
|
||||
are considered zero.
|
||||
If None or -1, suitable machine precision is used.
|
||||
lower : bool, optional
|
||||
Whether the pertinent array data is taken from the lower or upper
|
||||
triangle of `mat`. (Default: lower)
|
||||
check_finite : boolean, optional
|
||||
Whether to check that the input matrix contains only finite numbers.
|
||||
Disabling may give a performance gain, but may result in problems
|
||||
(crashes, non-termination) if the inputs do contain infinities or NaNs.
|
||||
|
||||
Returns
|
||||
-------
|
||||
M : array_like
|
||||
The pseudo-inverse of the input matrix is np.dot(M, M.T).
|
||||
log_pdet : float
|
||||
Logarithm of the pseudo-determinant of the matrix.
|
||||
|
||||
"""
|
||||
# Compute the symmetric eigendecomposition.
|
||||
# The input covariance matrix is required to be real symmetric
|
||||
# and positive semidefinite which implies that its eigenvalues
|
||||
# are all real and non-negative,
|
||||
# but clip them anyway to avoid numerical issues.
|
||||
|
||||
# TODO: the code to set cond/rcond is identical to that in
|
||||
# scipy.linalg.{pinvh, pinv2} and if/when this function is subsumed
|
||||
# into scipy.linalg it should probably be shared between all of
|
||||
# these routines.
|
||||
|
||||
# Note that eigh takes care of array conversion, chkfinite,
|
||||
# and assertion that the matrix is square.
|
||||
s, u = scipy.linalg.eigh(mat, lower=lower, check_finite=check_finite)
|
||||
|
||||
if rcond is not None:
|
||||
cond = rcond
|
||||
if cond in [None, -1]:
|
||||
t = u.dtype.char.lower()
|
||||
factor = {'f': 1E3, 'd': 1E6}
|
||||
cond = factor[t] * np.finfo(t).eps
|
||||
eps = cond * np.max(abs(s))
|
||||
|
||||
if np.min(s) < -eps:
|
||||
raise ValueError('the covariance matrix must be positive semidefinite')
|
||||
|
||||
s_pinv = _pinv_1d(s, eps)
|
||||
U = np.multiply(u, np.sqrt(s_pinv))
|
||||
log_pdet = np.sum(np.log(s[s > eps]))
|
||||
|
||||
return U, log_pdet
|
||||
|
||||
|
||||
_doc_default_callparams = \
|
||||
"""mean : array_like, optional
|
||||
Mean of the distribution (default zero)
|
||||
cov : array_like, optional
|
||||
Covariance matrix of the distribution (default one)
|
||||
"""
|
||||
|
||||
_doc_callparams_note = \
|
||||
"""Setting the parameter `mean` to `None` is equivalent to having `mean`
|
||||
be the zero-vector. The parameter `cov` can be a scalar, in which case
|
||||
the covariance matrix is the identity times that value, a vector of
|
||||
diagonal entries for the covariance matrix, or a two-dimensional
|
||||
array_like.
|
||||
"""
|
||||
|
||||
_doc_frozen_callparams = ""
|
||||
|
||||
_doc_frozen_callparams_note = \
|
||||
"""See class definition for a detailed description of parameters."""
|
||||
|
||||
docdict_params = {
|
||||
'_doc_default_callparams': _doc_default_callparams,
|
||||
'_doc_callparams_note': _doc_callparams_note
|
||||
}
|
||||
|
||||
docdict_noparams = {
|
||||
'_doc_default_callparams': _doc_frozen_callparams,
|
||||
'_doc_callparams_note': _doc_frozen_callparams_note
|
||||
}
|
||||
|
||||
|
||||
class multivariate_normal_gen(object):
|
||||
r"""
|
||||
A multivariate normal random variable.
|
||||
|
||||
The `mean` keyword specifies the mean. The `cov` keyword specifies the
|
||||
covariance matrix.
|
||||
|
||||
.. versionadded:: 0.14.0
|
||||
|
||||
Methods
|
||||
-------
|
||||
pdf(x, mean=None, cov=1)
|
||||
Probability density function.
|
||||
logpdf(x, mean=None, cov=1)
|
||||
Log of the probability density function.
|
||||
rvs(mean=None, cov=1)
|
||||
Draw random samples from a multivariate normal distribution.
|
||||
entropy()
|
||||
Compute the differential entropy of the multivariate normal.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array_like
|
||||
Quantiles, with the last axis of `x` denoting the components.
|
||||
%(_doc_default_callparams)s
|
||||
|
||||
Alternatively, the object may be called (as a function) to fix the mean
|
||||
and covariance parameters, returning a "frozen" multivariate normal
|
||||
random variable:
|
||||
|
||||
rv = multivariate_normal(mean=None, scale=1)
|
||||
- Frozen object with the same methods but holding the given
|
||||
mean and covariance fixed.
|
||||
|
||||
Notes
|
||||
-----
|
||||
%(_doc_callparams_note)s
|
||||
|
||||
The covariance matrix `cov` must be a (symmetric) positive
|
||||
semi-definite matrix. The determinant and inverse of `cov` are computed
|
||||
as the pseudo-determinant and pseudo-inverse, respectively, so
|
||||
that `cov` does not need to have full rank.
|
||||
|
||||
The probability density function for `multivariate_normal` is
|
||||
|
||||
.. math::
|
||||
|
||||
f(x) = \frac{1}{\sqrt{(2 \pi)^k \det \Sigma}} \exp\left( -\frac{1}{2} (x - \mu)^T \Sigma^{-1} (x - \mu) \right),
|
||||
|
||||
where :math:`\mu` is the mean, :math:`\Sigma` the covariance matrix,
|
||||
and :math:`k` is the dimension of the space where :math:`x` takes values.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.stats import multivariate_normal
|
||||
>>> x = np.linspace(0, 5, 10, endpoint=False)
|
||||
>>> y = multivariate_normal.pdf(x, mean=2.5, cov=0.5); y
|
||||
array([ 0.00108914, 0.01033349, 0.05946514, 0.20755375, 0.43939129,
|
||||
0.56418958, 0.43939129, 0.20755375, 0.05946514, 0.01033349])
|
||||
>>> plt.plot(x, y)
|
||||
|
||||
The input quantiles can be any shape of array, as long as the last
|
||||
axis labels the components. This allows us for instance to
|
||||
display the frozen pdf for a non-isotropic random variable in 2D as
|
||||
follows:
|
||||
|
||||
>>> x, y = np.mgrid[-1:1:.01, -1:1:.01]
|
||||
>>> pos = np.empty(x.shape + (2,))
|
||||
>>> pos[:, :, 0] = x; pos[:, :, 1] = y
|
||||
>>> rv = multivariate_normal([0.5, -0.2], [[2.0, 0.3], [0.3, 0.5]])
|
||||
>>> plt.contourf(x, y, rv.pdf(pos))
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.__doc__ = doccer.docformat(self.__doc__, docdict_params)
|
||||
|
||||
def __call__(self, mean=None, cov=1):
|
||||
"""
|
||||
Create a frozen multivariate normal distribution.
|
||||
|
||||
See `multivariate_normal_frozen` for more information.
|
||||
|
||||
"""
|
||||
return multivariate_normal_frozen(mean, cov)
|
||||
|
||||
def _logpdf(self, x, mean, prec_U, log_det_cov):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
x : ndarray
|
||||
Points at which to evaluate the log of the probability
|
||||
density function
|
||||
mean : ndarray
|
||||
Mean of the distribution
|
||||
prec_U : ndarray
|
||||
A decomposition such that np.dot(prec_U, prec_U.T)
|
||||
is the precision matrix, i.e. inverse of the covariance matrix.
|
||||
log_det_cov : float
|
||||
Logarithm of the determinant of the covariance matrix
|
||||
|
||||
Notes
|
||||
-----
|
||||
As this function does no argument checking, it should not be
|
||||
called directly; use 'logpdf' instead.
|
||||
|
||||
"""
|
||||
dim = x.shape[-1]
|
||||
dev = x - mean
|
||||
maha = np.sum(np.square(np.dot(dev, prec_U)), axis=-1)
|
||||
return -0.5 * (dim * _LOG_2PI + log_det_cov + maha)
|
||||
|
||||
def logpdf(self, x, mean, cov):
|
||||
"""
|
||||
Log of the multivariate normal probability density function.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array_like
|
||||
Quantiles, with the last axis of `x` denoting the components.
|
||||
%(_doc_default_callparams)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
%(_doc_callparams_note)s
|
||||
|
||||
Returns
|
||||
-------
|
||||
pdf : ndarray
|
||||
Log of the probability density function evaluated at `x`
|
||||
|
||||
"""
|
||||
dim, mean, cov = _process_parameters(None, mean, cov)
|
||||
x = _process_quantiles(x, dim)
|
||||
prec_U, log_det_cov = _psd_pinv_decomposed_log_pdet(cov)
|
||||
out = self._logpdf(x, mean, prec_U, log_det_cov)
|
||||
return _squeeze_output(out)
|
||||
|
||||
def pdf(self, x, mean, cov):
|
||||
"""
|
||||
Multivariate normal probability density function.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array_like
|
||||
Quantiles, with the last axis of `x` denoting the components.
|
||||
%(_doc_default_callparams)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
%(_doc_callparams_note)s
|
||||
|
||||
Returns
|
||||
-------
|
||||
pdf : ndarray
|
||||
Probability density function evaluated at `x`
|
||||
|
||||
"""
|
||||
dim, mean, cov = _process_parameters(None, mean, cov)
|
||||
x = _process_quantiles(x, dim)
|
||||
prec_U, log_det_cov = _psd_pinv_decomposed_log_pdet(cov)
|
||||
out = np.exp(self._logpdf(x, mean, prec_U, log_det_cov))
|
||||
return _squeeze_output(out)
|
||||
|
||||
def rvs(self, mean=None, cov=1, size=1):
|
||||
"""
|
||||
Draw random samples from a multivariate normal distribution.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
%(_doc_default_callparams)s
|
||||
size : integer, optional
|
||||
Number of samples to draw (default 1).
|
||||
|
||||
Notes
|
||||
-----
|
||||
%(_doc_callparams_note)s
|
||||
|
||||
Returns
|
||||
-------
|
||||
rvs : ndarray or scalar
|
||||
Random variates of size (`size`, `N`), where `N` is the
|
||||
dimension of the random variable.
|
||||
|
||||
"""
|
||||
dim, mean, cov = _process_parameters(None, mean, cov)
|
||||
out = np.random.multivariate_normal(mean, cov, size)
|
||||
return _squeeze_output(out)
|
||||
|
||||
def entropy(self, mean=None, cov=1):
|
||||
"""
|
||||
Compute the differential entropy of the multivariate normal.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
%(_doc_default_callparams)s
|
||||
|
||||
Notes
|
||||
-----
|
||||
%(_doc_callparams_note)s
|
||||
|
||||
Returns
|
||||
-------
|
||||
h : scalar
|
||||
Entropy of the multivariate normal distribution
|
||||
|
||||
"""
|
||||
dim, mean, cov = _process_parameters(None, mean, cov)
|
||||
return 1/2 * np.log(np.linalg.det(2 * np.pi * np.e * cov))
|
||||
|
||||
multivariate_normal = multivariate_normal_gen()
|
||||
|
||||
|
||||
class multivariate_normal_frozen(object):
|
||||
def __init__(self, mean=None, cov=1):
|
||||
"""
|
||||
Create a frozen multivariate normal distribution.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mean : array_like, optional
|
||||
Mean of the distribution (default zero)
|
||||
cov : array_like, optional
|
||||
Covariance matrix of the distribution (default one)
|
||||
|
||||
Examples
|
||||
--------
|
||||
When called with the default parameters, this will create a 1D random
|
||||
variable with mean 0 and covariance 1:
|
||||
|
||||
>>> from scipy.stats import multivariate_normal
|
||||
>>> r = multivariate_normal()
|
||||
>>> r.mean
|
||||
array([ 0.])
|
||||
>>> r.cov
|
||||
array([[1.]])
|
||||
|
||||
"""
|
||||
self.dim, self.mean, self.cov = _process_parameters(None, mean, cov)
|
||||
self.prec_U, self._log_det_cov = _psd_pinv_decomposed_log_pdet(self.cov)
|
||||
|
||||
self._mnorm = multivariate_normal_gen()
|
||||
|
||||
def logpdf(self, x):
|
||||
x = _process_quantiles(x, self.dim)
|
||||
out = self._mnorm._logpdf(x, self.mean, self.prec_U, self._log_det_cov)
|
||||
return _squeeze_output(out)
|
||||
|
||||
def pdf(self, x):
|
||||
return np.exp(self.logpdf(x))
|
||||
|
||||
def rvs(self, size=1):
|
||||
return self._mnorm.rvs(self.mean, self.cov, size)
|
||||
|
||||
def entropy(self):
|
||||
"""
|
||||
Computes the differential entropy of the multivariate normal.
|
||||
|
||||
Returns
|
||||
-------
|
||||
h : scalar
|
||||
Entropy of the multivariate normal distribution
|
||||
|
||||
"""
|
||||
return 1/2 * (self.dim * (_LOG_2PI + 1) + self._log_det_cov)
|
||||
|
||||
|
||||
# Set frozen generator docstrings from corresponding docstrings in
|
||||
# multivariate_normal_gen and fill in default strings in class docstrings
|
||||
for name in ['logpdf', 'pdf', 'rvs']:
|
||||
method = multivariate_normal_gen.__dict__[name]
|
||||
method_frozen = multivariate_normal_frozen.__dict__[name]
|
||||
method_frozen.__doc__ = doccer.docformat(method.__doc__, docdict_noparams)
|
||||
method.__doc__ = doccer.docformat(method.__doc__, docdict_params)
|
@ -0,0 +1,201 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
from numpy import poly1d
|
||||
from scipy.special import beta
|
||||
|
||||
|
||||
# The following code was used to generate the Pade coefficients for the
|
||||
# Tukey Lambda variance function. Version 0.17 of mpmath was used.
|
||||
#---------------------------------------------------------------------------
|
||||
# import mpmath as mp
|
||||
#
|
||||
# mp.mp.dps = 60
|
||||
#
|
||||
# one = mp.mpf(1)
|
||||
# two = mp.mpf(2)
|
||||
#
|
||||
# def mpvar(lam):
|
||||
# if lam == 0:
|
||||
# v = mp.pi**2 / three
|
||||
# else:
|
||||
# v = (two / lam**2) * (one / (one + two*lam) -
|
||||
# mp.beta(lam + one, lam + one))
|
||||
# return v
|
||||
#
|
||||
# t = mp.taylor(mpvar, 0, 8)
|
||||
# p, q = mp.pade(t, 4, 4)
|
||||
# print "p =", [mp.fp.mpf(c) for c in p]
|
||||
# print "q =", [mp.fp.mpf(c) for c in q]
|
||||
#---------------------------------------------------------------------------
|
||||
|
||||
# Pade coefficients for the Tukey Lambda variance function.
|
||||
_tukeylambda_var_pc = [3.289868133696453, 0.7306125098871127,
|
||||
-0.5370742306855439, 0.17292046290190008,
|
||||
-0.02371146284628187]
|
||||
_tukeylambda_var_qc = [1.0, 3.683605511659861, 4.184152498888124,
|
||||
1.7660926747377275, 0.2643989311168465]
|
||||
|
||||
# numpy.poly1d instances for the numerator and denominator of the
|
||||
# Pade approximation to the Tukey Lambda variance.
|
||||
_tukeylambda_var_p = poly1d(_tukeylambda_var_pc[::-1])
|
||||
_tukeylambda_var_q = poly1d(_tukeylambda_var_qc[::-1])
|
||||
|
||||
|
||||
def tukeylambda_variance(lam):
|
||||
"""Variance of the Tukey Lambda distribution.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
lam : array_like
|
||||
The lambda values at which to compute the variance.
|
||||
|
||||
Returns
|
||||
-------
|
||||
v : ndarray
|
||||
The variance. For lam < -0.5, the variance is not defined, so
|
||||
np.nan is returned. For lam = 0.5, np.inf is returned.
|
||||
|
||||
Notes
|
||||
-----
|
||||
In an interval around lambda=0, this function uses the [4,4] Pade
|
||||
approximation to compute the variance. Otherwise it uses the standard
|
||||
formula (http://en.wikipedia.org/wiki/Tukey_lambda_distribution). The
|
||||
Pade approximation is used because the standard formula has a removable
|
||||
discontinuity at lambda = 0, and does not produce accurate numerical
|
||||
results near lambda = 0.
|
||||
"""
|
||||
lam = np.asarray(lam)
|
||||
shp = lam.shape
|
||||
lam = np.atleast_1d(lam).astype(np.float64)
|
||||
|
||||
# For absolute values of lam less than threshold, use the Pade
|
||||
# approximation.
|
||||
threshold = 0.075
|
||||
|
||||
# Play games with masks to implement the conditional evaluation of
|
||||
# the distribution.
|
||||
# lambda < -0.5: var = nan
|
||||
low_mask = lam < -0.5
|
||||
# lambda == -0.5: var = inf
|
||||
neghalf_mask = lam == -0.5
|
||||
# abs(lambda) < threshold: use Pade approximation
|
||||
small_mask = np.abs(lam) < threshold
|
||||
# else the "regular" case: use the explicit formula.
|
||||
reg_mask = ~(low_mask | neghalf_mask | small_mask)
|
||||
|
||||
# Get the 'lam' values for the cases where they are needed.
|
||||
small = lam[small_mask]
|
||||
reg = lam[reg_mask]
|
||||
|
||||
# Compute the function for each case.
|
||||
v = np.empty_like(lam)
|
||||
v[low_mask] = np.nan
|
||||
v[neghalf_mask] = np.inf
|
||||
if small.size > 0:
|
||||
# Use the Pade approximation near lambda = 0.
|
||||
v[small_mask] = _tukeylambda_var_p(small) / _tukeylambda_var_q(small)
|
||||
if reg.size > 0:
|
||||
v[reg_mask] = (2.0 / reg**2) * (1.0 / (1.0 + 2 * reg) -
|
||||
beta(reg + 1, reg + 1))
|
||||
v.shape = shp
|
||||
return v
|
||||
|
||||
|
||||
# The following code was used to generate the Pade coefficients for the
|
||||
# Tukey Lambda kurtosis function. Version 0.17 of mpmath was used.
|
||||
#---------------------------------------------------------------------------
|
||||
# import mpmath as mp
|
||||
#
|
||||
# mp.mp.dps = 60
|
||||
#
|
||||
# one = mp.mpf(1)
|
||||
# two = mp.mpf(2)
|
||||
# three = mp.mpf(3)
|
||||
# four = mp.mpf(4)
|
||||
#
|
||||
# def mpkurt(lam):
|
||||
# if lam == 0:
|
||||
# k = mp.mpf(6)/5
|
||||
# else:
|
||||
# numer = (one/(four*lam+one) - four*mp.beta(three*lam+one, lam+one) +
|
||||
# three*mp.beta(two*lam+one, two*lam+one))
|
||||
# denom = two*(one/(two*lam+one) - mp.beta(lam+one,lam+one))**2
|
||||
# k = numer / denom - three
|
||||
# return k
|
||||
#
|
||||
# # There is a bug in mpmath 0.17: when we use the 'method' keyword of the
|
||||
# # taylor function and we request a degree 9 Taylor polynomial, we actually
|
||||
# # get degree 8.
|
||||
# t = mp.taylor(mpkurt, 0, 9, method='quad', radius=0.01)
|
||||
# t = [mp.chop(c, tol=1e-15) for c in t]
|
||||
# p, q = mp.pade(t, 4, 4)
|
||||
# print "p =", [mp.fp.mpf(c) for c in p]
|
||||
# print "q =", [mp.fp.mpf(c) for c in q]
|
||||
#---------------------------------------------------------------------------
|
||||
|
||||
# Pade coefficients for the Tukey Lambda kurtosis function.
|
||||
_tukeylambda_kurt_pc = [1.2, -5.853465139719495, -22.653447381131077,
|
||||
0.20601184383406815, 4.59796302262789]
|
||||
_tukeylambda_kurt_qc = [1.0, 7.171149192233599, 12.96663094361842,
|
||||
0.43075235247853005, -2.789746758009912]
|
||||
|
||||
# numpy.poly1d instances for the numerator and denominator of the
|
||||
# Pade approximation to the Tukey Lambda kurtosis.
|
||||
_tukeylambda_kurt_p = poly1d(_tukeylambda_kurt_pc[::-1])
|
||||
_tukeylambda_kurt_q = poly1d(_tukeylambda_kurt_qc[::-1])
|
||||
|
||||
|
||||
def tukeylambda_kurtosis(lam):
|
||||
"""Kurtosis of the Tukey Lambda distribution.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
lam : array_like
|
||||
The lambda values at which to compute the variance.
|
||||
|
||||
Returns
|
||||
-------
|
||||
v : ndarray
|
||||
The variance. For lam < -0.25, the variance is not defined, so
|
||||
np.nan is returned. For lam = 0.25, np.inf is returned.
|
||||
|
||||
"""
|
||||
lam = np.asarray(lam)
|
||||
shp = lam.shape
|
||||
lam = np.atleast_1d(lam).astype(np.float64)
|
||||
|
||||
# For absolute values of lam less than threshold, use the Pade
|
||||
# approximation.
|
||||
threshold = 0.055
|
||||
|
||||
# Use masks to implement the conditional evaluation of the kurtosis.
|
||||
# lambda < -0.25: kurtosis = nan
|
||||
low_mask = lam < -0.25
|
||||
# lambda == -0.25: kurtosis = inf
|
||||
negqrtr_mask = lam == -0.25
|
||||
# lambda near 0: use Pade approximation
|
||||
small_mask = np.abs(lam) < threshold
|
||||
# else the "regular" case: use the explicit formula.
|
||||
reg_mask = ~(low_mask | negqrtr_mask | small_mask)
|
||||
|
||||
# Get the 'lam' values for the cases where they are needed.
|
||||
small = lam[small_mask]
|
||||
reg = lam[reg_mask]
|
||||
|
||||
# Compute the function for each case.
|
||||
k = np.empty_like(lam)
|
||||
k[low_mask] = np.nan
|
||||
k[negqrtr_mask] = np.inf
|
||||
if small.size > 0:
|
||||
k[small_mask] = _tukeylambda_kurt_p(small) / _tukeylambda_kurt_q(small)
|
||||
if reg.size > 0:
|
||||
numer = (1.0 / (4 * reg + 1) - 4 * beta(3 * reg + 1, reg + 1) +
|
||||
3 * beta(2 * reg + 1, 2 * reg + 1))
|
||||
denom = 2 * (1.0/(2 * reg + 1) - beta(reg + 1, reg + 1))**2
|
||||
k[reg_mask] = numer / denom - 3
|
||||
|
||||
# The return value will be a numpy array; resetting the shape ensures that
|
||||
# if `lam` was a scalar, the return value is a 0-d array.
|
||||
k.shape = shp
|
||||
return k
|
@ -0,0 +1,271 @@
|
||||
"""Some functions for working with contingency tables (i.e. cross tabulations).
|
||||
"""
|
||||
|
||||
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from functools import reduce
|
||||
import numpy as np
|
||||
from .stats import power_divergence
|
||||
|
||||
|
||||
__all__ = ['margins', 'expected_freq', 'chi2_contingency']
|
||||
|
||||
|
||||
def margins(a):
|
||||
"""Return a list of the marginal sums of the array `a`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a : ndarray
|
||||
The array for which to compute the marginal sums.
|
||||
|
||||
Returns
|
||||
-------
|
||||
margsums : list of ndarrays
|
||||
A list of length `a.ndim`. `margsums[k]` is the result
|
||||
of summing `a` over all axes except `k`; it has the same
|
||||
number of dimensions as `a`, but the length of each axis
|
||||
except axis `k` will be 1.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> a = np.arange(12).reshape(2, 6)
|
||||
>>> a
|
||||
array([[ 0, 1, 2, 3, 4, 5],
|
||||
[ 6, 7, 8, 9, 10, 11]])
|
||||
>>> m0, m1 = margins(a)
|
||||
>>> m0
|
||||
array([[15],
|
||||
[51]])
|
||||
>>> m1
|
||||
array([[ 6, 8, 10, 12, 14, 16]])
|
||||
|
||||
>>> b = np.arange(24).reshape(2,3,4)
|
||||
>>> m0, m1, m2 = margins(b)
|
||||
>>> m0
|
||||
array([[[ 66]],
|
||||
[[210]]])
|
||||
>>> m1
|
||||
array([[[ 60],
|
||||
[ 92],
|
||||
[124]]])
|
||||
>>> m2
|
||||
array([[[60, 66, 72, 78]]])
|
||||
"""
|
||||
margsums = []
|
||||
ranged = list(range(a.ndim))
|
||||
for k in ranged:
|
||||
marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
|
||||
margsums.append(marg)
|
||||
return margsums
|
||||
|
||||
|
||||
def expected_freq(observed):
|
||||
"""
|
||||
Compute the expected frequencies from a contingency table.
|
||||
|
||||
Given an n-dimensional contingency table of observed frequencies,
|
||||
compute the expected frequencies for the table based on the marginal
|
||||
sums under the assumption that the groups associated with each
|
||||
dimension are independent.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
observed : array_like
|
||||
The table of observed frequencies. (While this function can handle
|
||||
a 1-D array, that case is trivial. Generally `observed` is at
|
||||
least 2-D.)
|
||||
|
||||
Returns
|
||||
-------
|
||||
expected : ndarray of float64
|
||||
The expected frequencies, based on the marginal sums of the table.
|
||||
Same shape as `observed`.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> observed = np.array([[10, 10, 20],[20, 20, 20]])
|
||||
>>> expected_freq(observed)
|
||||
array([[ 12., 12., 16.],
|
||||
[ 18., 18., 24.]])
|
||||
|
||||
"""
|
||||
# Typically `observed` is an integer array. If `observed` has a large
|
||||
# number of dimensions or holds large values, some of the following
|
||||
# computations may overflow, so we first switch to floating point.
|
||||
observed = np.asarray(observed, dtype=np.float64)
|
||||
|
||||
# Create a list of the marginal sums.
|
||||
margsums = margins(observed)
|
||||
|
||||
# Create the array of expected frequencies. The shapes of the
|
||||
# marginal sums returned by apply_over_axes() are just what we
|
||||
# need for broadcasting in the following product.
|
||||
d = observed.ndim
|
||||
expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
|
||||
return expected
|
||||
|
||||
|
||||
def chi2_contingency(observed, correction=True, lambda_=None):
|
||||
"""Chi-square test of independence of variables in a contingency table.
|
||||
|
||||
This function computes the chi-square statistic and p-value for the
|
||||
hypothesis test of independence of the observed frequencies in the
|
||||
contingency table [1]_ `observed`. The expected frequencies are computed
|
||||
based on the marginal sums under the assumption of independence; see
|
||||
`scipy.stats.contingency.expected_freq`. The number of degrees of
|
||||
freedom is (expressed using numpy functions and attributes)::
|
||||
|
||||
dof = observed.size - sum(observed.shape) + observed.ndim - 1
|
||||
|
||||
|
||||
Parameters
|
||||
----------
|
||||
observed : array_like
|
||||
The contingency table. The table contains the observed frequencies
|
||||
(i.e. number of occurrences) in each category. In the two-dimensional
|
||||
case, the table is often described as an "R x C table".
|
||||
correction : bool, optional
|
||||
If True, *and* the degrees of freedom is 1, apply Yates' correction
|
||||
for continuity. The effect of the correction is to adjust each
|
||||
observed value by 0.5 towards the corresponding expected value.
|
||||
lambda_ : float or str, optional.
|
||||
By default, the statistic computed in this test is Pearson's
|
||||
chi-squared statistic [2]_. `lambda_` allows a statistic from the
|
||||
Cressie-Read power divergence family [3]_ to be used instead. See
|
||||
`power_divergence` for details.
|
||||
|
||||
Returns
|
||||
-------
|
||||
chi2 : float
|
||||
The test statistic.
|
||||
p : float
|
||||
The p-value of the test
|
||||
dof : int
|
||||
Degrees of freedom
|
||||
expected : ndarray, same shape as `observed`
|
||||
The expected frequencies, based on the marginal sums of the table.
|
||||
|
||||
See Also
|
||||
--------
|
||||
contingency.expected_freq
|
||||
fisher_exact
|
||||
chisquare
|
||||
power_divergence
|
||||
|
||||
Notes
|
||||
-----
|
||||
An often quoted guideline for the validity of this calculation is that
|
||||
the test should be used only if the observed and expected frequency in
|
||||
each cell is at least 5.
|
||||
|
||||
This is a test for the independence of different categories of a
|
||||
population. The test is only meaningful when the dimension of
|
||||
`observed` is two or more. Applying the test to a one-dimensional
|
||||
table will always result in `expected` equal to `observed` and a
|
||||
chi-square statistic equal to 0.
|
||||
|
||||
This function does not handle masked arrays, because the calculation
|
||||
does not make sense with missing values.
|
||||
|
||||
Like stats.chisquare, this function computes a chi-square statistic;
|
||||
the convenience this function provides is to figure out the expected
|
||||
frequencies and degrees of freedom from the given contingency table.
|
||||
If these were already known, and if the Yates' correction was not
|
||||
required, one could use stats.chisquare. That is, if one calls::
|
||||
|
||||
chi2, p, dof, ex = chi2_contingency(obs, correction=False)
|
||||
|
||||
then the following is true::
|
||||
|
||||
(chi2, p) == stats.chisquare(obs.ravel(), f_exp=ex.ravel(),
|
||||
ddof=obs.size - 1 - dof)
|
||||
|
||||
The `lambda_` argument was added in version 0.13.0 of scipy.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table
|
||||
.. [2] "Pearson's chi-squared test",
|
||||
http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
|
||||
.. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
|
||||
Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
|
||||
pp. 440-464.
|
||||
|
||||
Examples
|
||||
--------
|
||||
A two-way example (2 x 3):
|
||||
|
||||
>>> obs = np.array([[10, 10, 20], [20, 20, 20]])
|
||||
>>> chi2_contingency(obs)
|
||||
(2.7777777777777777,
|
||||
0.24935220877729619,
|
||||
2,
|
||||
array([[ 12., 12., 16.],
|
||||
[ 18., 18., 24.]]))
|
||||
|
||||
Perform the test using the log-likelihood ratio (i.e. the "G-test")
|
||||
instead of Pearson's chi-squared statistic.
|
||||
|
||||
>>> g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood")
|
||||
>>> g, p
|
||||
(2.7688587616781319, 0.25046668010954165)
|
||||
|
||||
A four-way example (2 x 2 x 2 x 2):
|
||||
|
||||
>>> obs = np.array(
|
||||
... [[[[12, 17],
|
||||
... [11, 16]],
|
||||
... [[11, 12],
|
||||
... [15, 16]]],
|
||||
... [[[23, 15],
|
||||
... [30, 22]],
|
||||
... [[14, 17],
|
||||
... [15, 16]]]])
|
||||
>>> chi2_contingency(obs)
|
||||
(8.7584514426741897,
|
||||
0.64417725029295503,
|
||||
11,
|
||||
array([[[[ 14.15462386, 14.15462386],
|
||||
[ 16.49423111, 16.49423111]],
|
||||
[[ 11.2461395 , 11.2461395 ],
|
||||
[ 13.10500554, 13.10500554]]],
|
||||
[[[ 19.5591166 , 19.5591166 ],
|
||||
[ 22.79202844, 22.79202844]],
|
||||
[[ 15.54012004, 15.54012004],
|
||||
[ 18.10873492, 18.10873492]]]]))
|
||||
"""
|
||||
observed = np.asarray(observed)
|
||||
if np.any(observed < 0):
|
||||
raise ValueError("All values in `observed` must be nonnegative.")
|
||||
if observed.size == 0:
|
||||
raise ValueError("No data; `observed` has size 0.")
|
||||
|
||||
expected = expected_freq(observed)
|
||||
if np.any(expected == 0):
|
||||
# Include one of the positions where expected is zero in
|
||||
# the exception message.
|
||||
zeropos = list(np.where(expected == 0)[0])
|
||||
raise ValueError("The internally computed table of expected "
|
||||
"frequencies has a zero element at %s." % zeropos)
|
||||
|
||||
# The degrees of freedom
|
||||
dof = expected.size - sum(expected.shape) + expected.ndim - 1
|
||||
|
||||
if dof == 0:
|
||||
# Degenerate case; this occurs when `observed` is 1D (or, more
|
||||
# generally, when it has only one nontrivial dimension). In this
|
||||
# case, we also have observed == expected, so chi2 is 0.
|
||||
chi2 = 0.0
|
||||
p = 1.0
|
||||
else:
|
||||
if dof == 1 and correction:
|
||||
# Adjust `observed` according to Yates' correction for continuity.
|
||||
observed = observed + 0.5 * np.sign(expected - observed)
|
||||
|
||||
chi2, p = power_divergence(observed, expected,
|
||||
ddof=observed.size - 1 - dof, axis=None,
|
||||
lambda_=lambda_)
|
||||
|
||||
return chi2, p, dof, expected
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,513 @@
|
||||
#-------------------------------------------------------------------------------
|
||||
#
|
||||
# Define classes for (uni/multi)-variate kernel density estimation.
|
||||
#
|
||||
# Currently, only Gaussian kernels are implemented.
|
||||
#
|
||||
# Written by: Robert Kern
|
||||
#
|
||||
# Date: 2004-08-09
|
||||
#
|
||||
# Modified: 2005-02-10 by Robert Kern.
|
||||
# Contributed to Scipy
|
||||
# 2005-10-07 by Robert Kern.
|
||||
# Some fixes to match the new scipy_core
|
||||
#
|
||||
# Copyright 2004-2005 by Enthought, Inc.
|
||||
#
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
# Standard library imports.
|
||||
import warnings
|
||||
|
||||
# Scipy imports.
|
||||
from scipy.lib.six import callable, string_types
|
||||
from scipy import linalg, special
|
||||
|
||||
from numpy import atleast_2d, reshape, zeros, newaxis, dot, exp, pi, sqrt, \
|
||||
ravel, power, atleast_1d, squeeze, sum, transpose
|
||||
import numpy as np
|
||||
from numpy.random import randint, multivariate_normal
|
||||
|
||||
# Local imports.
|
||||
from . import mvn
|
||||
|
||||
|
||||
__all__ = ['gaussian_kde']
|
||||
|
||||
|
||||
class gaussian_kde(object):
|
||||
"""Representation of a kernel-density estimate using Gaussian kernels.
|
||||
|
||||
Kernel density estimation is a way to estimate the probability density
|
||||
function (PDF) of a random variable in a non-parametric way.
|
||||
`gaussian_kde` works for both uni-variate and multi-variate data. It
|
||||
includes automatic bandwidth determination. The estimation works best for
|
||||
a unimodal distribution; bimodal or multi-modal distributions tend to be
|
||||
oversmoothed.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dataset : array_like
|
||||
Datapoints to estimate from. In case of univariate data this is a 1-D
|
||||
array, otherwise a 2-D array with shape (# of dims, # of data).
|
||||
bw_method : str, scalar or callable, optional
|
||||
The method used to calculate the estimator bandwidth. This can be
|
||||
'scott', 'silverman', a scalar constant or a callable. If a scalar,
|
||||
this will be used directly as `kde.factor`. If a callable, it should
|
||||
take a `gaussian_kde` instance as only parameter and return a scalar.
|
||||
If None (default), 'scott' is used. See Notes for more details.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
dataset : ndarray
|
||||
The dataset with which `gaussian_kde` was initialized.
|
||||
d : int
|
||||
Number of dimensions.
|
||||
n : int
|
||||
Number of datapoints.
|
||||
factor : float
|
||||
The bandwidth factor, obtained from `kde.covariance_factor`, with which
|
||||
the covariance matrix is multiplied.
|
||||
covariance : ndarray
|
||||
The covariance matrix of `dataset`, scaled by the calculated bandwidth
|
||||
(`kde.factor`).
|
||||
inv_cov : ndarray
|
||||
The inverse of `covariance`.
|
||||
|
||||
Methods
|
||||
-------
|
||||
kde.evaluate(points) : ndarray
|
||||
Evaluate the estimated pdf on a provided set of points.
|
||||
kde(points) : ndarray
|
||||
Same as kde.evaluate(points)
|
||||
kde.integrate_gaussian(mean, cov) : float
|
||||
Multiply pdf with a specified Gaussian and integrate over the whole
|
||||
domain.
|
||||
kde.integrate_box_1d(low, high) : float
|
||||
Integrate pdf (1D only) between two bounds.
|
||||
kde.integrate_box(low_bounds, high_bounds) : float
|
||||
Integrate pdf over a rectangular space between low_bounds and
|
||||
high_bounds.
|
||||
kde.integrate_kde(other_kde) : float
|
||||
Integrate two kernel density estimates multiplied together.
|
||||
kde.resample(size=None) : ndarray
|
||||
Randomly sample a dataset from the estimated pdf.
|
||||
kde.set_bandwidth(bw_method='scott') : None
|
||||
Computes the bandwidth, i.e. the coefficient that multiplies the data
|
||||
covariance matrix to obtain the kernel covariance matrix.
|
||||
.. versionadded:: 0.11.0
|
||||
kde.covariance_factor : float
|
||||
Computes the coefficient (`kde.factor`) that multiplies the data
|
||||
covariance matrix to obtain the kernel covariance matrix.
|
||||
The default is `scotts_factor`. A subclass can overwrite this method
|
||||
to provide a different method, or set it through a call to
|
||||
`kde.set_bandwidth`.
|
||||
|
||||
|
||||
Notes
|
||||
-----
|
||||
Bandwidth selection strongly influences the estimate obtained from the KDE
|
||||
(much more so than the actual shape of the kernel). Bandwidth selection
|
||||
can be done by a "rule of thumb", by cross-validation, by "plug-in
|
||||
methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
|
||||
uses a rule of thumb, the default is Scott's Rule.
|
||||
|
||||
Scott's Rule [1]_, implemented as `scotts_factor`, is::
|
||||
|
||||
n**(-1./(d+4)),
|
||||
|
||||
with ``n`` the number of data points and ``d`` the number of dimensions.
|
||||
Silverman's Rule [2]_, implemented as `silverman_factor`, is::
|
||||
|
||||
n * (d + 2) / 4.)**(-1. / (d + 4)).
|
||||
|
||||
Good general descriptions of kernel density estimation can be found in [1]_
|
||||
and [2]_, the mathematics for this multi-dimensional implementation can be
|
||||
found in [1]_.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
|
||||
Visualization", John Wiley & Sons, New York, Chicester, 1992.
|
||||
.. [2] B.W. Silverman, "Density Estimation for Statistics and Data
|
||||
Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
|
||||
Chapman and Hall, London, 1986.
|
||||
.. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
|
||||
Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
|
||||
.. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
|
||||
conditional density estimation", Computational Statistics & Data
|
||||
Analysis, Vol. 36, pp. 279-298, 2001.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Generate some random two-dimensional data:
|
||||
|
||||
>>> from scipy import stats
|
||||
>>> def measure(n):
|
||||
>>> "Measurement model, return two coupled measurements."
|
||||
>>> m1 = np.random.normal(size=n)
|
||||
>>> m2 = np.random.normal(scale=0.5, size=n)
|
||||
>>> return m1+m2, m1-m2
|
||||
|
||||
>>> m1, m2 = measure(2000)
|
||||
>>> xmin = m1.min()
|
||||
>>> xmax = m1.max()
|
||||
>>> ymin = m2.min()
|
||||
>>> ymax = m2.max()
|
||||
|
||||
Perform a kernel density estimate on the data:
|
||||
|
||||
>>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
|
||||
>>> positions = np.vstack([X.ravel(), Y.ravel()])
|
||||
>>> values = np.vstack([m1, m2])
|
||||
>>> kernel = stats.gaussian_kde(values)
|
||||
>>> Z = np.reshape(kernel(positions).T, X.shape)
|
||||
|
||||
Plot the results:
|
||||
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> fig = plt.figure()
|
||||
>>> ax = fig.add_subplot(111)
|
||||
>>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
|
||||
... extent=[xmin, xmax, ymin, ymax])
|
||||
>>> ax.plot(m1, m2, 'k.', markersize=2)
|
||||
>>> ax.set_xlim([xmin, xmax])
|
||||
>>> ax.set_ylim([ymin, ymax])
|
||||
>>> plt.show()
|
||||
|
||||
"""
|
||||
def __init__(self, dataset, bw_method=None):
|
||||
self.dataset = atleast_2d(dataset)
|
||||
if not self.dataset.size > 1:
|
||||
raise ValueError("`dataset` input should have multiple elements.")
|
||||
|
||||
self.d, self.n = self.dataset.shape
|
||||
self.set_bandwidth(bw_method=bw_method)
|
||||
|
||||
def evaluate(self, points):
|
||||
"""Evaluate the estimated pdf on a set of points.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
points : (# of dimensions, # of points)-array
|
||||
Alternatively, a (# of dimensions,) vector can be passed in and
|
||||
treated as a single point.
|
||||
|
||||
Returns
|
||||
-------
|
||||
values : (# of points,)-array
|
||||
The values at each point.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError : if the dimensionality of the input points is different than
|
||||
the dimensionality of the KDE.
|
||||
|
||||
"""
|
||||
points = atleast_2d(points)
|
||||
|
||||
d, m = points.shape
|
||||
if d != self.d:
|
||||
if d == 1 and m == self.d:
|
||||
# points was passed in as a row vector
|
||||
points = reshape(points, (self.d, 1))
|
||||
m = 1
|
||||
else:
|
||||
msg = "points have dimension %s, dataset has dimension %s" % (d,
|
||||
self.d)
|
||||
raise ValueError(msg)
|
||||
|
||||
result = zeros((m,), dtype=np.float)
|
||||
|
||||
if m >= self.n:
|
||||
# there are more points than data, so loop over data
|
||||
for i in range(self.n):
|
||||
diff = self.dataset[:, i, newaxis] - points
|
||||
tdiff = dot(self.inv_cov, diff)
|
||||
energy = sum(diff*tdiff,axis=0) / 2.0
|
||||
result = result + exp(-energy)
|
||||
else:
|
||||
# loop over points
|
||||
for i in range(m):
|
||||
diff = self.dataset - points[:, i, newaxis]
|
||||
tdiff = dot(self.inv_cov, diff)
|
||||
energy = sum(diff * tdiff, axis=0) / 2.0
|
||||
result[i] = sum(exp(-energy), axis=0)
|
||||
|
||||
result = result / self._norm_factor
|
||||
|
||||
return result
|
||||
|
||||
__call__ = evaluate
|
||||
|
||||
def integrate_gaussian(self, mean, cov):
|
||||
"""
|
||||
Multiply estimated density by a multivariate Gaussian and integrate
|
||||
over the whole space.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mean : aray_like
|
||||
A 1-D array, specifying the mean of the Gaussian.
|
||||
cov : array_like
|
||||
A 2-D array, specifying the covariance matrix of the Gaussian.
|
||||
|
||||
Returns
|
||||
-------
|
||||
result : scalar
|
||||
The value of the integral.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError :
|
||||
If the mean or covariance of the input Gaussian differs from
|
||||
the KDE's dimensionality.
|
||||
|
||||
"""
|
||||
mean = atleast_1d(squeeze(mean))
|
||||
cov = atleast_2d(cov)
|
||||
|
||||
if mean.shape != (self.d,):
|
||||
raise ValueError("mean does not have dimension %s" % self.d)
|
||||
if cov.shape != (self.d, self.d):
|
||||
raise ValueError("covariance does not have dimension %s" % self.d)
|
||||
|
||||
# make mean a column vector
|
||||
mean = mean[:, newaxis]
|
||||
|
||||
sum_cov = self.covariance + cov
|
||||
|
||||
diff = self.dataset - mean
|
||||
tdiff = dot(linalg.inv(sum_cov), diff)
|
||||
|
||||
energies = sum(diff * tdiff, axis=0) / 2.0
|
||||
result = sum(exp(-energies), axis=0) / sqrt(linalg.det(2 * pi *
|
||||
sum_cov)) / self.n
|
||||
|
||||
return result
|
||||
|
||||
def integrate_box_1d(self, low, high):
|
||||
"""
|
||||
Computes the integral of a 1D pdf between two bounds.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
low : scalar
|
||||
Lower bound of integration.
|
||||
high : scalar
|
||||
Upper bound of integration.
|
||||
|
||||
Returns
|
||||
-------
|
||||
value : scalar
|
||||
The result of the integral.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the KDE is over more than one dimension.
|
||||
|
||||
"""
|
||||
if self.d != 1:
|
||||
raise ValueError("integrate_box_1d() only handles 1D pdfs")
|
||||
|
||||
stdev = ravel(sqrt(self.covariance))[0]
|
||||
|
||||
normalized_low = ravel((low - self.dataset) / stdev)
|
||||
normalized_high = ravel((high - self.dataset) / stdev)
|
||||
|
||||
value = np.mean(special.ndtr(normalized_high) -
|
||||
special.ndtr(normalized_low))
|
||||
return value
|
||||
|
||||
def integrate_box(self, low_bounds, high_bounds, maxpts=None):
|
||||
"""Computes the integral of a pdf over a rectangular interval.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
low_bounds : array_like
|
||||
A 1-D array containing the lower bounds of integration.
|
||||
high_bounds : array_like
|
||||
A 1-D array containing the upper bounds of integration.
|
||||
maxpts : int, optional
|
||||
The maximum number of points to use for integration.
|
||||
|
||||
Returns
|
||||
-------
|
||||
value : scalar
|
||||
The result of the integral.
|
||||
|
||||
"""
|
||||
if maxpts is not None:
|
||||
extra_kwds = {'maxpts': maxpts}
|
||||
else:
|
||||
extra_kwds = {}
|
||||
|
||||
value, inform = mvn.mvnun(low_bounds, high_bounds, self.dataset,
|
||||
self.covariance, **extra_kwds)
|
||||
if inform:
|
||||
msg = ('An integral in mvn.mvnun requires more points than %s' %
|
||||
(self.d * 1000))
|
||||
warnings.warn(msg)
|
||||
|
||||
return value
|
||||
|
||||
def integrate_kde(self, other):
|
||||
"""
|
||||
Computes the integral of the product of this kernel density estimate
|
||||
with another.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
other : gaussian_kde instance
|
||||
The other kde.
|
||||
|
||||
Returns
|
||||
-------
|
||||
value : scalar
|
||||
The result of the integral.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the KDEs have different dimensionality.
|
||||
|
||||
"""
|
||||
if other.d != self.d:
|
||||
raise ValueError("KDEs are not the same dimensionality")
|
||||
|
||||
# we want to iterate over the smallest number of points
|
||||
if other.n < self.n:
|
||||
small = other
|
||||
large = self
|
||||
else:
|
||||
small = self
|
||||
large = other
|
||||
|
||||
sum_cov = small.covariance + large.covariance
|
||||
result = 0.0
|
||||
for i in range(small.n):
|
||||
mean = small.dataset[:, i, newaxis]
|
||||
diff = large.dataset - mean
|
||||
tdiff = dot(linalg.inv(sum_cov), diff)
|
||||
|
||||
energies = sum(diff * tdiff, axis=0) / 2.0
|
||||
result += sum(exp(-energies), axis=0)
|
||||
|
||||
result /= sqrt(linalg.det(2 * pi * sum_cov)) * large.n * small.n
|
||||
|
||||
return result
|
||||
|
||||
def resample(self, size=None):
|
||||
"""
|
||||
Randomly sample a dataset from the estimated pdf.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
size : int, optional
|
||||
The number of samples to draw. If not provided, then the size is
|
||||
the same as the underlying dataset.
|
||||
|
||||
Returns
|
||||
-------
|
||||
resample : (self.d, `size`) ndarray
|
||||
The sampled dataset.
|
||||
|
||||
"""
|
||||
if size is None:
|
||||
size = self.n
|
||||
|
||||
norm = transpose(multivariate_normal(zeros((self.d,), float),
|
||||
self.covariance, size=size))
|
||||
indices = randint(0, self.n, size=size)
|
||||
means = self.dataset[:, indices]
|
||||
|
||||
return means + norm
|
||||
|
||||
def scotts_factor(self):
|
||||
return power(self.n, -1./(self.d+4))
|
||||
|
||||
def silverman_factor(self):
|
||||
return power(self.n*(self.d+2.0)/4.0, -1./(self.d+4))
|
||||
|
||||
# Default method to calculate bandwidth, can be overwritten by subclass
|
||||
covariance_factor = scotts_factor
|
||||
|
||||
def set_bandwidth(self, bw_method=None):
|
||||
"""Compute the estimator bandwidth with given method.
|
||||
|
||||
The new bandwidth calculated after a call to `set_bandwidth` is used
|
||||
for subsequent evaluations of the estimated density.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
bw_method : str, scalar or callable, optional
|
||||
The method used to calculate the estimator bandwidth. This can be
|
||||
'scott', 'silverman', a scalar constant or a callable. If a
|
||||
scalar, this will be used directly as `kde.factor`. If a callable,
|
||||
it should take a `gaussian_kde` instance as only parameter and
|
||||
return a scalar. If None (default), nothing happens; the current
|
||||
`kde.covariance_factor` method is kept.
|
||||
|
||||
Notes
|
||||
-----
|
||||
.. versionadded:: 0.11
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> x1 = np.array([-7, -5, 1, 4, 5.])
|
||||
>>> kde = stats.gaussian_kde(x1)
|
||||
>>> xs = np.linspace(-10, 10, num=50)
|
||||
>>> y1 = kde(xs)
|
||||
>>> kde.set_bandwidth(bw_method='silverman')
|
||||
>>> y2 = kde(xs)
|
||||
>>> kde.set_bandwidth(bw_method=kde.factor / 3.)
|
||||
>>> y3 = kde(xs)
|
||||
|
||||
>>> fig = plt.figure()
|
||||
>>> ax = fig.add_subplot(111)
|
||||
>>> ax.plot(x1, np.ones(x1.shape) / (4. * x1.size), 'bo',
|
||||
... label='Data points (rescaled)')
|
||||
>>> ax.plot(xs, y1, label='Scott (default)')
|
||||
>>> ax.plot(xs, y2, label='Silverman')
|
||||
>>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
|
||||
>>> ax.legend()
|
||||
>>> plt.show()
|
||||
|
||||
"""
|
||||
if bw_method is None:
|
||||
pass
|
||||
elif bw_method == 'scott':
|
||||
self.covariance_factor = self.scotts_factor
|
||||
elif bw_method == 'silverman':
|
||||
self.covariance_factor = self.silverman_factor
|
||||
elif np.isscalar(bw_method) and not isinstance(bw_method, string_types):
|
||||
self._bw_method = 'use constant'
|
||||
self.covariance_factor = lambda: bw_method
|
||||
elif callable(bw_method):
|
||||
self._bw_method = bw_method
|
||||
self.covariance_factor = lambda: self._bw_method(self)
|
||||
else:
|
||||
msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
|
||||
"or a callable."
|
||||
raise ValueError(msg)
|
||||
|
||||
self._compute_covariance()
|
||||
|
||||
def _compute_covariance(self):
|
||||
"""Computes the covariance matrix for each Gaussian kernel using
|
||||
covariance_factor().
|
||||
"""
|
||||
self.factor = self.covariance_factor()
|
||||
# Cache covariance and inverse covariance of the data
|
||||
if not hasattr(self, '_data_inv_cov'):
|
||||
self._data_covariance = atleast_2d(np.cov(self.dataset, rowvar=1,
|
||||
bias=False))
|
||||
self._data_inv_cov = linalg.inv(self._data_covariance)
|
||||
|
||||
self.covariance = self._data_covariance * self.factor**2
|
||||
self.inv_cov = self._data_inv_cov / self.factor**2
|
||||
self._norm_factor = sqrt(linalg.det(2*pi*self.covariance)) * self.n
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,82 @@
|
||||
"""
|
||||
===================================================================
|
||||
Statistical functions for masked arrays (:mod:`scipy.stats.mstats`)
|
||||
===================================================================
|
||||
|
||||
.. currentmodule:: scipy.stats.mstats
|
||||
|
||||
This module contains a large number of statistical functions that can
|
||||
be used with masked arrays.
|
||||
|
||||
Most of these functions are similar to those in scipy.stats but might
|
||||
have small differences in the API or in the algorithm used. Since this
|
||||
is a relatively new package, some API changes are still possible.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
argstoarray
|
||||
betai
|
||||
chisquare
|
||||
count_tied_groups
|
||||
describe
|
||||
f_oneway
|
||||
f_value_wilks_lambda
|
||||
find_repeats
|
||||
friedmanchisquare
|
||||
gmean
|
||||
hmean
|
||||
kendalltau
|
||||
kendalltau_seasonal
|
||||
kruskalwallis
|
||||
kruskalwallis
|
||||
ks_twosamp
|
||||
ks_twosamp
|
||||
kurtosis
|
||||
kurtosistest
|
||||
linregress
|
||||
mannwhitneyu
|
||||
plotting_positions
|
||||
mode
|
||||
moment
|
||||
mquantiles
|
||||
msign
|
||||
normaltest
|
||||
obrientransform
|
||||
pearsonr
|
||||
plotting_positions
|
||||
pointbiserialr
|
||||
rankdata
|
||||
scoreatpercentile
|
||||
sem
|
||||
signaltonoise
|
||||
skew
|
||||
skewtest
|
||||
spearmanr
|
||||
theilslopes
|
||||
threshold
|
||||
tmax
|
||||
tmean
|
||||
tmin
|
||||
trim
|
||||
trima
|
||||
trimboth
|
||||
trimmed_stde
|
||||
trimr
|
||||
trimtail
|
||||
tsem
|
||||
ttest_onesamp
|
||||
ttest_ind
|
||||
ttest_onesamp
|
||||
ttest_rel
|
||||
tvar
|
||||
variation
|
||||
winsorize
|
||||
zmap
|
||||
zscore
|
||||
|
||||
"""
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from .mstats_basic import *
|
||||
from .mstats_extras import *
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,466 @@
|
||||
"""
|
||||
Additional statistics functions, with support to MA.
|
||||
|
||||
:author: Pierre GF Gerard-Marchant
|
||||
:contact: pierregm_at_uga_edu
|
||||
:date: $Date: 2007-10-29 17:18:13 +0200 (Mon, 29 Oct 2007) $
|
||||
:version: $Id: morestats.py 3473 2007-10-29 15:18:13Z jarrod.millman $
|
||||
"""
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
__author__ = "Pierre GF Gerard-Marchant"
|
||||
__docformat__ = "restructuredtext en"
|
||||
|
||||
|
||||
__all__ = ['compare_medians_ms',
|
||||
'hdquantiles', 'hdmedian', 'hdquantiles_sd',
|
||||
'idealfourths',
|
||||
'median_cihs','mjci','mquantiles_cimj',
|
||||
'rsh',
|
||||
'trimmed_mean_ci',]
|
||||
|
||||
import numpy as np
|
||||
from numpy import float_, int_, ndarray
|
||||
|
||||
import numpy.ma as ma
|
||||
from numpy.ma import MaskedArray
|
||||
|
||||
from . import mstats_basic as mstats
|
||||
|
||||
from scipy.stats.distributions import norm, beta, t, binom
|
||||
|
||||
|
||||
#####--------------------------------------------------------------------------
|
||||
#---- --- Quantiles ---
|
||||
#####--------------------------------------------------------------------------
|
||||
def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
|
||||
"""
|
||||
Computes quantile estimates with the Harrell-Davis method.
|
||||
|
||||
The quantile estimates are calculated as a weighted linear combination
|
||||
of order statistics.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Data array.
|
||||
prob : sequence
|
||||
Sequence of quantiles to compute.
|
||||
axis : int
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
var : boolean
|
||||
Whether to return the variance of the estimate.
|
||||
|
||||
Returns
|
||||
-------
|
||||
hdquantiles : MaskedArray
|
||||
A (p,) array of quantiles (if `var` is False), or a (2,p) array of
|
||||
quantiles and variances (if `var` is True), where ``p`` is the
|
||||
number of quantiles.
|
||||
|
||||
"""
|
||||
def _hd_1D(data,prob,var):
|
||||
"Computes the HD quantiles for a 1D array. Returns nan for invalid data."
|
||||
xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
|
||||
# Don't use length here, in case we have a numpy scalar
|
||||
n = xsorted.size
|
||||
#.........
|
||||
hd = np.empty((2,len(prob)), float_)
|
||||
if n < 2:
|
||||
hd.flat = np.nan
|
||||
if var:
|
||||
return hd
|
||||
return hd[0]
|
||||
#.........
|
||||
v = np.arange(n+1) / float(n)
|
||||
betacdf = beta.cdf
|
||||
for (i,p) in enumerate(prob):
|
||||
_w = betacdf(v, (n+1)*p, (n+1)*(1-p))
|
||||
w = _w[1:] - _w[:-1]
|
||||
hd_mean = np.dot(w, xsorted)
|
||||
hd[0,i] = hd_mean
|
||||
#
|
||||
hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
|
||||
#
|
||||
hd[0, prob == 0] = xsorted[0]
|
||||
hd[0, prob == 1] = xsorted[-1]
|
||||
if var:
|
||||
hd[1, prob == 0] = hd[1, prob == 1] = np.nan
|
||||
return hd
|
||||
return hd[0]
|
||||
# Initialization & checks ---------
|
||||
data = ma.array(data, copy=False, dtype=float_)
|
||||
p = np.array(prob, copy=False, ndmin=1)
|
||||
# Computes quantiles along axis (or globally)
|
||||
if (axis is None) or (data.ndim == 1):
|
||||
result = _hd_1D(data, p, var)
|
||||
else:
|
||||
if data.ndim > 2:
|
||||
raise ValueError("Array 'data' must be at most two dimensional, but got data.ndim = %d" % data.ndim)
|
||||
result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
|
||||
#
|
||||
return ma.fix_invalid(result, copy=False)
|
||||
|
||||
#..............................................................................
|
||||
|
||||
|
||||
def hdmedian(data, axis=-1, var=False):
|
||||
"""
|
||||
Returns the Harrell-Davis estimate of the median along the given axis.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : ndarray
|
||||
Data array.
|
||||
axis : int
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
var : boolean
|
||||
Whether to return the variance of the estimate.
|
||||
|
||||
"""
|
||||
result = hdquantiles(data,[0.5], axis=axis, var=var)
|
||||
return result.squeeze()
|
||||
|
||||
|
||||
#..............................................................................
|
||||
def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
|
||||
"""
|
||||
The standard error of the Harrell-Davis quantile estimates by jackknife.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Data array.
|
||||
prob : sequence
|
||||
Sequence of quantiles to compute.
|
||||
axis : int
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
|
||||
Returns
|
||||
-------
|
||||
hdquantiles_sd : MaskedArray
|
||||
Standard error of the Harrell-Davis quantile estimates.
|
||||
|
||||
"""
|
||||
def _hdsd_1D(data,prob):
|
||||
"Computes the std error for 1D arrays."
|
||||
xsorted = np.sort(data.compressed())
|
||||
n = len(xsorted)
|
||||
#.........
|
||||
hdsd = np.empty(len(prob), float_)
|
||||
if n < 2:
|
||||
hdsd.flat = np.nan
|
||||
#.........
|
||||
vv = np.arange(n) / float(n-1)
|
||||
betacdf = beta.cdf
|
||||
#
|
||||
for (i,p) in enumerate(prob):
|
||||
_w = betacdf(vv, (n+1)*p, (n+1)*(1-p))
|
||||
w = _w[1:] - _w[:-1]
|
||||
mx_ = np.fromiter([np.dot(w,xsorted[np.r_[list(range(0,k)),
|
||||
list(range(k+1,n))].astype(int_)])
|
||||
for k in range(n)], dtype=float_)
|
||||
mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1)
|
||||
hdsd[i] = float(n-1) * np.sqrt(np.diag(mx_var).diagonal() / float(n))
|
||||
return hdsd
|
||||
# Initialization & checks ---------
|
||||
data = ma.array(data, copy=False, dtype=float_)
|
||||
p = np.array(prob, copy=False, ndmin=1)
|
||||
# Computes quantiles along axis (or globally)
|
||||
if (axis is None):
|
||||
result = _hdsd_1D(data, p)
|
||||
else:
|
||||
if data.ndim > 2:
|
||||
raise ValueError("Array 'data' must be at most two dimensional, but got data.ndim = %d" % data.ndim)
|
||||
result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
|
||||
#
|
||||
return ma.fix_invalid(result, copy=False).ravel()
|
||||
|
||||
|
||||
#####--------------------------------------------------------------------------
|
||||
#---- --- Confidence intervals ---
|
||||
#####--------------------------------------------------------------------------
|
||||
|
||||
def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
|
||||
alpha=0.05, axis=None):
|
||||
"""
|
||||
Selected confidence interval of the trimmed mean along the given axis.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Input data.
|
||||
limits : {None, tuple}, optional
|
||||
None or a two item tuple.
|
||||
Tuple of the percentages to cut on each side of the array, with respect
|
||||
to the number of unmasked data, as floats between 0. and 1. If ``n``
|
||||
is the number of unmasked data before trimming, then
|
||||
(``n`` * `limits[0]`)th smallest data and (``n`` * `limits[1]`)th
|
||||
largest data are masked. The total number of unmasked data after
|
||||
trimming is ``n`` * (1. - sum(`limits`)).
|
||||
The value of one limit can be set to None to indicate an open interval.
|
||||
|
||||
Defaults to (0.2, 0.2).
|
||||
inclusive : (2,) tuple of boolean, optional
|
||||
If relative==False, tuple indicating whether values exactly equal to
|
||||
the absolute limits are allowed.
|
||||
If relative==True, tuple indicating whether the number of data being
|
||||
masked on each side should be rounded (True) or truncated (False).
|
||||
|
||||
Defaults to (True, True).
|
||||
alpha : float, optional
|
||||
Confidence level of the intervals.
|
||||
|
||||
Defaults to 0.05.
|
||||
axis : int, optional
|
||||
Axis along which to cut. If None, uses a flattened version of `data`.
|
||||
|
||||
Defaults to None.
|
||||
|
||||
Returns
|
||||
-------
|
||||
trimmed_mean_ci : (2,) ndarray
|
||||
The lower and upper confidence intervals of the trimmed data.
|
||||
|
||||
"""
|
||||
data = ma.array(data, copy=False)
|
||||
trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
|
||||
tmean = trimmed.mean(axis)
|
||||
tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
|
||||
df = trimmed.count(axis) - 1
|
||||
tppf = t.ppf(1-alpha/2.,df)
|
||||
return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
|
||||
|
||||
#..............................................................................
|
||||
|
||||
|
||||
def mjci(data, prob=[0.25,0.5,0.75], axis=None):
|
||||
"""
|
||||
Returns the Maritz-Jarrett estimators of the standard error of selected
|
||||
experimental quantiles of the data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data: ndarray
|
||||
Data array.
|
||||
prob: sequence
|
||||
Sequence of quantiles to compute.
|
||||
axis : int
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
|
||||
"""
|
||||
def _mjci_1D(data, p):
|
||||
data = np.sort(data.compressed())
|
||||
n = data.size
|
||||
prob = (np.array(p) * n + 0.5).astype(int_)
|
||||
betacdf = beta.cdf
|
||||
#
|
||||
mj = np.empty(len(prob), float_)
|
||||
x = np.arange(1,n+1, dtype=float_) / n
|
||||
y = x - 1./n
|
||||
for (i,m) in enumerate(prob):
|
||||
(m1,m2) = (m-1, n-m)
|
||||
W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
|
||||
C1 = np.dot(W,data)
|
||||
C2 = np.dot(W,data**2)
|
||||
mj[i] = np.sqrt(C2 - C1**2)
|
||||
return mj
|
||||
#
|
||||
data = ma.array(data, copy=False)
|
||||
if data.ndim > 2:
|
||||
raise ValueError("Array 'data' must be at most two dimensional, but got data.ndim = %d" % data.ndim)
|
||||
p = np.array(prob, copy=False, ndmin=1)
|
||||
# Computes quantiles along axis (or globally)
|
||||
if (axis is None):
|
||||
return _mjci_1D(data, p)
|
||||
else:
|
||||
return ma.apply_along_axis(_mjci_1D, axis, data, p)
|
||||
|
||||
#..............................................................................
|
||||
|
||||
|
||||
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
|
||||
"""
|
||||
Computes the alpha confidence interval for the selected quantiles of the
|
||||
data, with Maritz-Jarrett estimators.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : ndarray
|
||||
Data array.
|
||||
prob : sequence
|
||||
Sequence of quantiles to compute.
|
||||
alpha : float
|
||||
Confidence level of the intervals.
|
||||
axis : integer
|
||||
Axis along which to compute the quantiles.
|
||||
If None, use a flattened array.
|
||||
|
||||
"""
|
||||
alpha = min(alpha, 1-alpha)
|
||||
z = norm.ppf(1-alpha/2.)
|
||||
xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
|
||||
smj = mjci(data, prob, axis=axis)
|
||||
return (xq - z * smj, xq + z * smj)
|
||||
|
||||
|
||||
#.............................................................................
|
||||
def median_cihs(data, alpha=0.05, axis=None):
|
||||
"""
|
||||
Computes the alpha-level confidence interval for the median of the data.
|
||||
|
||||
Uses the Hettmasperger-Sheather method.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Input data. Masked values are discarded. The input should be 1D only,
|
||||
or `axis` should be set to None.
|
||||
alpha : float
|
||||
Confidence level of the intervals.
|
||||
axis : integer
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
|
||||
Returns
|
||||
-------
|
||||
median_cihs :
|
||||
Alpha level confidence interval.
|
||||
|
||||
"""
|
||||
def _cihs_1D(data, alpha):
|
||||
data = np.sort(data.compressed())
|
||||
n = len(data)
|
||||
alpha = min(alpha, 1-alpha)
|
||||
k = int(binom._ppf(alpha/2., n, 0.5))
|
||||
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
|
||||
if gk < 1-alpha:
|
||||
k -= 1
|
||||
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
|
||||
gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
|
||||
I = (gk - 1 + alpha)/(gk - gkk)
|
||||
lambd = (n-k) * I / float(k + (n-2*k)*I)
|
||||
lims = (lambd*data[k] + (1-lambd)*data[k-1],
|
||||
lambd*data[n-k-1] + (1-lambd)*data[n-k])
|
||||
return lims
|
||||
data = ma.rray(data, copy=False)
|
||||
# Computes quantiles along axis (or globally)
|
||||
if (axis is None):
|
||||
result = _cihs_1D(data.compressed(), alpha)
|
||||
else:
|
||||
if data.ndim > 2:
|
||||
raise ValueError("Array 'data' must be at most two dimensional, but got data.ndim = %d" % data.ndim)
|
||||
result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
|
||||
#
|
||||
return result
|
||||
|
||||
#..............................................................................
|
||||
|
||||
|
||||
def compare_medians_ms(group_1, group_2, axis=None):
|
||||
"""
|
||||
Compares the medians from two independent groups along the given axis.
|
||||
|
||||
The comparison is performed using the McKean-Schrader estimate of the
|
||||
standard error of the medians.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
group_1 : array_like
|
||||
First dataset.
|
||||
group_2 : array_like
|
||||
Second dataset.
|
||||
axis : int, optional
|
||||
Axis along which the medians are estimated. If None, the arrays are
|
||||
flattened. If `axis` is not None, then `group_1` and `group_2`
|
||||
should have the same shape.
|
||||
|
||||
Returns
|
||||
-------
|
||||
compare_medians_ms : {float, ndarray}
|
||||
If `axis` is None, then returns a float, otherwise returns a 1-D
|
||||
ndarray of floats with a length equal to the length of `group_1`
|
||||
along `axis`.
|
||||
|
||||
"""
|
||||
(med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
|
||||
(std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
|
||||
mstats.stde_median(group_2, axis=axis))
|
||||
W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
|
||||
return 1 - norm.cdf(W)
|
||||
|
||||
|
||||
def idealfourths(data, axis=None):
|
||||
"""
|
||||
Returns an estimate of the lower and upper quartiles.
|
||||
|
||||
Uses the ideal fourths algorithm.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Input array.
|
||||
axis : int, optional
|
||||
Axis along which the quartiles are estimated. If None, the arrays are
|
||||
flattened.
|
||||
|
||||
Returns
|
||||
-------
|
||||
idealfourths : {list of floats, masked array}
|
||||
Returns the two internal values that divide `data` into four parts
|
||||
using the ideal fourths algorithm either along the flattened array
|
||||
(if `axis` is None) or along `axis` of `data`.
|
||||
|
||||
"""
|
||||
def _idf(data):
|
||||
x = data.compressed()
|
||||
n = len(x)
|
||||
if n < 3:
|
||||
return [np.nan,np.nan]
|
||||
(j,h) = divmod(n/4. + 5/12.,1)
|
||||
j = int(j)
|
||||
qlo = (1-h)*x[j-1] + h*x[j]
|
||||
k = n - j
|
||||
qup = (1-h)*x[k] + h*x[k-1]
|
||||
return [qlo, qup]
|
||||
data = ma.sort(data, axis=axis).view(MaskedArray)
|
||||
if (axis is None):
|
||||
return _idf(data)
|
||||
else:
|
||||
return ma.apply_along_axis(_idf, axis, data)
|
||||
|
||||
|
||||
def rsh(data, points=None):
|
||||
"""
|
||||
Evaluates Rosenblatt's shifted histogram estimators for each point
|
||||
on the dataset 'data'.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : sequence
|
||||
Input data. Masked values are ignored.
|
||||
points : sequence
|
||||
Sequence of points where to evaluate Rosenblatt shifted histogram.
|
||||
If None, use the data.
|
||||
|
||||
"""
|
||||
data = ma.array(data, copy=False)
|
||||
if points is None:
|
||||
points = data
|
||||
else:
|
||||
points = np.array(points, copy=False, ndmin=1)
|
||||
if data.ndim != 1:
|
||||
raise AttributeError("The input array should be 1D only !")
|
||||
n = data.count()
|
||||
r = idealfourths(data, axis=None)
|
||||
h = 1.2 * (r[-1]-r[0]) / n**(1./5)
|
||||
nhi = (data[:,None] <= points[None,:] + h).sum(0)
|
||||
nlo = (data[:,None] < points[None,:] - h).sum(0)
|
||||
return (nhi-nlo) / (2.*n*h)
|
||||
|
||||
|
||||
###############################################################################
|
@ -0,0 +1,76 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from numpy import vectorize, deprecate
|
||||
from numpy.random import random_sample
|
||||
|
||||
__all__ = ['randwppf', 'randwcdf']
|
||||
|
||||
# XXX: Are these needed anymore?
|
||||
|
||||
#####################################
|
||||
# General purpose continuous
|
||||
######################################
|
||||
|
||||
|
||||
@deprecate(message="Deprecated in scipy 0.14.0, use "
|
||||
"distribution-specific rvs() method instead")
|
||||
def randwppf(ppf, args=(), size=None):
|
||||
"""
|
||||
returns an array of randomly distributed integers of a distribution
|
||||
whose percent point function (inverse of the CDF or quantile function)
|
||||
is given.
|
||||
|
||||
args is a tuple of extra arguments to the ppf function (i.e. shape,
|
||||
location, scale), and size is the size of the output. Note the ppf
|
||||
function must accept an array of q values to compute over.
|
||||
|
||||
"""
|
||||
U = random_sample(size=size)
|
||||
return ppf(*(U,)+args)
|
||||
|
||||
|
||||
@deprecate(message="Deprecated in scipy 0.14.0, use "
|
||||
"distribution-specific rvs() method instead")
|
||||
def randwcdf(cdf, mean=1.0, args=(), size=None):
|
||||
"""
|
||||
Returns an array of randomly distributed integers given a CDF.
|
||||
|
||||
Given a cumulative distribution function (CDF) returns an array of
|
||||
randomly distributed integers that would satisfy the CDF.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cdf : function
|
||||
CDF function that accepts a single value and `args`, and returns
|
||||
an single value.
|
||||
mean : float, optional
|
||||
The mean of the distribution which helps the solver. Defaults
|
||||
to 1.0.
|
||||
args : tuple, optional
|
||||
Extra arguments to the cdf function (i.e. shape, location, scale)
|
||||
size : {int, None}, optional
|
||||
Is the size of the output. If None, only 1 value will be returned.
|
||||
|
||||
Returns
|
||||
-------
|
||||
randwcdf : ndarray
|
||||
Array of random numbers.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Can use the ``scipy.stats.distributions.*.cdf`` functions for the
|
||||
`cdf` parameter.
|
||||
|
||||
"""
|
||||
import scipy.optimize as optimize
|
||||
|
||||
def _ppfopt(x, q, *nargs):
|
||||
newargs = (x,)+nargs
|
||||
return cdf(*newargs) - q
|
||||
|
||||
def _ppf(q, *nargs):
|
||||
return optimize.fsolve(_ppfopt, mean, args=(q,)+nargs)
|
||||
|
||||
_vppf = vectorize(_ppf)
|
||||
U = random_sample(size=size)
|
||||
return _vppf(*(U,)+args)
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,154 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import inspect
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import numpy.testing as npt
|
||||
|
||||
#from scipy.lib._version import NumpyVersion
|
||||
from scipy import stats
|
||||
|
||||
|
||||
#NUMPY_BELOW_1_7 = NumpyVersion(np.__version__) < '1.7.0'
|
||||
NUMPY_BELOW_1_7 =np.__version__ < '1.7.0'
|
||||
|
||||
|
||||
def check_normalization(distfn, args, distname):
|
||||
norm_moment = distfn.moment(0, *args)
|
||||
npt.assert_allclose(norm_moment, 1.0)
|
||||
|
||||
# this is a temporary plug: either ncf or expect is problematic;
|
||||
# best be marked as a knownfail, but I've no clue how to do it.
|
||||
if distname == "ncf":
|
||||
atol, rtol = 1e-5, 0
|
||||
else:
|
||||
atol, rtol = 1e-7, 1e-7
|
||||
|
||||
normalization_expect = distfn.expect(lambda x: 1, args=args)
|
||||
npt.assert_allclose(normalization_expect, 1.0, atol=atol, rtol=rtol,
|
||||
err_msg=distname, verbose=True)
|
||||
|
||||
normalization_cdf = distfn.cdf(distfn.b, *args)
|
||||
npt.assert_allclose(normalization_cdf, 1.0)
|
||||
|
||||
|
||||
def check_moment(distfn, arg, m, v, msg):
|
||||
m1 = distfn.moment(1, *arg)
|
||||
m2 = distfn.moment(2, *arg)
|
||||
if not np.isinf(m):
|
||||
npt.assert_almost_equal(m1, m, decimal=10, err_msg=msg +
|
||||
' - 1st moment')
|
||||
else: # or np.isnan(m1),
|
||||
npt.assert_(np.isinf(m1),
|
||||
msg + ' - 1st moment -infinite, m1=%s' % str(m1))
|
||||
|
||||
if not np.isinf(v):
|
||||
npt.assert_almost_equal(m2 - m1 * m1, v, decimal=10, err_msg=msg +
|
||||
' - 2ndt moment')
|
||||
else: # or np.isnan(m2),
|
||||
npt.assert_(np.isinf(m2),
|
||||
msg + ' - 2nd moment -infinite, m2=%s' % str(m2))
|
||||
|
||||
|
||||
def check_mean_expect(distfn, arg, m, msg):
|
||||
if np.isfinite(m):
|
||||
m1 = distfn.expect(lambda x: x, arg)
|
||||
npt.assert_almost_equal(m1, m, decimal=5, err_msg=msg +
|
||||
' - 1st moment (expect)')
|
||||
|
||||
|
||||
def check_var_expect(distfn, arg, m, v, msg):
|
||||
if np.isfinite(v):
|
||||
m2 = distfn.expect(lambda x: x*x, arg)
|
||||
npt.assert_almost_equal(m2, v + m*m, decimal=5, err_msg=msg +
|
||||
' - 2st moment (expect)')
|
||||
|
||||
|
||||
def check_skew_expect(distfn, arg, m, v, s, msg):
|
||||
if np.isfinite(s):
|
||||
m3e = distfn.expect(lambda x: np.power(x-m, 3), arg)
|
||||
npt.assert_almost_equal(m3e, s * np.power(v, 1.5),
|
||||
decimal=5, err_msg=msg + ' - skew')
|
||||
else:
|
||||
npt.assert_(np.isnan(s))
|
||||
|
||||
|
||||
def check_kurt_expect(distfn, arg, m, v, k, msg):
|
||||
if np.isfinite(k):
|
||||
m4e = distfn.expect(lambda x: np.power(x-m, 4), arg)
|
||||
npt.assert_allclose(m4e, (k + 3.) * np.power(v, 2), atol=1e-5, rtol=1e-5,
|
||||
err_msg=msg + ' - kurtosis')
|
||||
else:
|
||||
npt.assert_(np.isnan(k))
|
||||
|
||||
|
||||
def check_entropy(distfn, arg, msg):
|
||||
ent = distfn.entropy(*arg)
|
||||
npt.assert_(not np.isnan(ent), msg + 'test Entropy is nan')
|
||||
|
||||
|
||||
def check_private_entropy(distfn, args, superclass):
|
||||
# compare a generic _entropy with the distribution-specific implementation
|
||||
npt.assert_allclose(distfn._entropy(*args),
|
||||
superclass._entropy(distfn, *args))
|
||||
|
||||
|
||||
def check_edge_support(distfn, args):
|
||||
# Make sure the x=self.a and self.b are handled correctly.
|
||||
x = [distfn.a, distfn.b]
|
||||
if isinstance(distfn, stats.rv_continuous):
|
||||
npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0])
|
||||
npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0])
|
||||
|
||||
npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0])
|
||||
npt.assert_equal(distfn.logsf(x, *args), [0.0, -np.inf])
|
||||
|
||||
if isinstance(distfn, stats.rv_discrete):
|
||||
x = [distfn.a - 1, distfn.b]
|
||||
npt.assert_equal(distfn.ppf([0.0, 1.0], *args), x)
|
||||
npt.assert_equal(distfn.isf([0.0, 1.0], *args), x[::-1])
|
||||
|
||||
# out-of-bounds for isf & ppf
|
||||
npt.assert_(np.isnan(distfn.isf([-1, 2], *args)).all())
|
||||
npt.assert_(np.isnan(distfn.ppf([-1, 2], *args)).all())
|
||||
|
||||
|
||||
def check_named_args(distfn, x, shape_args, defaults, meths):
|
||||
## Check calling w/ named arguments.
|
||||
|
||||
# check consistency of shapes, numargs and _parse signature
|
||||
signature = inspect.getargspec(distfn._parse_args)
|
||||
npt.assert_(signature.varargs is None)
|
||||
npt.assert_(signature.keywords is None)
|
||||
npt.assert_(signature.defaults == defaults)
|
||||
|
||||
shape_argnames = signature.args[1:-len(defaults)] # self, a, b, loc=0, scale=1
|
||||
if distfn.shapes:
|
||||
shapes_ = distfn.shapes.replace(',', ' ').split()
|
||||
else:
|
||||
shapes_ = ''
|
||||
npt.assert_(len(shapes_) == distfn.numargs)
|
||||
npt.assert_(len(shapes_) == len(shape_argnames))
|
||||
|
||||
# check calling w/ named arguments
|
||||
shape_args = list(shape_args)
|
||||
|
||||
vals = [meth(x, *shape_args) for meth in meths]
|
||||
npt.assert_(np.all(np.isfinite(vals)))
|
||||
|
||||
names, a, k = shape_argnames[:], shape_args[:], {}
|
||||
while names:
|
||||
k.update({names.pop(): a.pop()})
|
||||
v = [meth(x, *a, **k) for meth in meths]
|
||||
npt.assert_array_equal(vals, v)
|
||||
if not 'n' in k.keys():
|
||||
# `n` is first parameter of moment(), so can't be used as named arg
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", UserWarning)
|
||||
npt.assert_equal(distfn.moment(1, *a, **k),
|
||||
distfn.moment(1, *shape_args))
|
||||
|
||||
# unknown arguments should not go through:
|
||||
k.update({'kaboom': 42})
|
||||
npt.assert_raises(TypeError, distfn.cdf, x, **k)
|
@ -0,0 +1,238 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_array_almost_equal, run_module_suite
|
||||
from scipy.stats import \
|
||||
binned_statistic, binned_statistic_2d, binned_statistic_dd
|
||||
|
||||
|
||||
class TestBinnedStatistic(object):
|
||||
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
np.random.seed(9865)
|
||||
cls.x = np.random.random(100)
|
||||
cls.y = np.random.random(100)
|
||||
cls.v = np.random.random(100)
|
||||
cls.X = np.random.random((100, 3))
|
||||
|
||||
def test_1d_count(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
count1, edges1, bc = binned_statistic(x, v, 'count', bins=10)
|
||||
count2, edges2 = np.histogram(x, bins=10)
|
||||
|
||||
assert_array_almost_equal(count1, count2)
|
||||
assert_array_almost_equal(edges1, edges2)
|
||||
|
||||
def test_1d_sum(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
sum1, edges1, bc = binned_statistic(x, v, 'sum', bins=10)
|
||||
sum2, edges2 = np.histogram(x, bins=10, weights=v)
|
||||
|
||||
assert_array_almost_equal(sum1, sum2)
|
||||
assert_array_almost_equal(edges1, edges2)
|
||||
|
||||
def test_1d_mean(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic(x, v, 'mean', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, v, np.mean, bins=10)
|
||||
|
||||
assert_array_almost_equal(stat1, stat2)
|
||||
assert_array_almost_equal(edges1, edges2)
|
||||
|
||||
def test_1d_std(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic(x, v, 'std', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, v, np.std, bins=10)
|
||||
|
||||
assert_array_almost_equal(stat1, stat2)
|
||||
assert_array_almost_equal(edges1, edges2)
|
||||
|
||||
def test_1d_median(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic(x, v, 'median', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, v, np.median, bins=10)
|
||||
|
||||
assert_array_almost_equal(stat1, stat2)
|
||||
assert_array_almost_equal(edges1, edges2)
|
||||
|
||||
def test_1d_bincode(self):
|
||||
x = self.x[:20]
|
||||
v = self.v[:20]
|
||||
|
||||
count1, edges1, bc = binned_statistic(x, v, 'count', bins=3)
|
||||
bc2 = np.array([3, 2, 1, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 1, 2, 3, 1,
|
||||
1, 2, 1])
|
||||
|
||||
bcount = [(bc == i).sum() for i in np.unique(bc)]
|
||||
|
||||
assert_array_almost_equal(bc, bc2)
|
||||
assert_array_almost_equal(bcount, count1)
|
||||
|
||||
def test_1d_range_keyword(self):
|
||||
# Regression test for gh-3063, range can be (min, max) or [(min, max)]
|
||||
np.random.seed(9865)
|
||||
x = np.arange(30)
|
||||
data = np.random.random(30)
|
||||
|
||||
mean, bins, _ = binned_statistic(x[:15], data[:15])
|
||||
mean_range, bins_range, _ = binned_statistic(x, data, range=[(0, 14)])
|
||||
mean_range2, bins_range2, _ = binned_statistic(x, data, range=(0, 14))
|
||||
|
||||
assert_array_almost_equal(mean, mean_range)
|
||||
assert_array_almost_equal(bins, bins_range)
|
||||
assert_array_almost_equal(mean, mean_range2)
|
||||
assert_array_almost_equal(bins, bins_range2)
|
||||
|
||||
def test_2d_count(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
count1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'count', bins=5)
|
||||
count2, binx2, biny2 = np.histogram2d(x, y, bins=5)
|
||||
|
||||
assert_array_almost_equal(count1, count2)
|
||||
assert_array_almost_equal(binx1, binx2)
|
||||
assert_array_almost_equal(biny1, biny2)
|
||||
|
||||
def test_2d_sum(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
sum1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'sum', bins=5)
|
||||
sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v)
|
||||
|
||||
assert_array_almost_equal(sum1, sum2)
|
||||
assert_array_almost_equal(binx1, binx2)
|
||||
assert_array_almost_equal(biny1, biny2)
|
||||
|
||||
def test_2d_mean(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'mean', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
|
||||
|
||||
assert_array_almost_equal(stat1, stat2)
|
||||
assert_array_almost_equal(binx1, binx2)
|
||||
assert_array_almost_equal(biny1, biny2)
|
||||
|
||||
def test_2d_std(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'std', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.std, bins=5)
|
||||
|
||||
assert_array_almost_equal(stat1, stat2)
|
||||
assert_array_almost_equal(binx1, binx2)
|
||||
assert_array_almost_equal(biny1, biny2)
|
||||
|
||||
def test_2d_median(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'median', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.median, bins=5)
|
||||
|
||||
assert_array_almost_equal(stat1, stat2)
|
||||
assert_array_almost_equal(binx1, binx2)
|
||||
assert_array_almost_equal(biny1, biny2)
|
||||
|
||||
def test_2d_bincode(self):
|
||||
x = self.x[:20]
|
||||
y = self.y[:20]
|
||||
v = self.v[:20]
|
||||
|
||||
count1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'count', bins=3)
|
||||
bc2 = np.array([17, 11, 6, 16, 11, 17, 18, 17, 17, 7, 6, 18, 16,
|
||||
6, 11, 16, 6, 6, 11, 8])
|
||||
|
||||
bcount = [(bc == i).sum() for i in np.unique(bc)]
|
||||
|
||||
assert_array_almost_equal(bc, bc2)
|
||||
count1adj = count1[count1.nonzero()]
|
||||
assert_array_almost_equal(bcount, count1adj)
|
||||
|
||||
def test_dd_count(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
|
||||
count2, edges2 = np.histogramdd(X, bins=3)
|
||||
|
||||
assert_array_almost_equal(count1, count2)
|
||||
assert_array_almost_equal(edges1, edges2)
|
||||
|
||||
def test_dd_sum(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
sum1, edges1, bc = binned_statistic_dd(X, v, 'sum', bins=3)
|
||||
sum2, edges2 = np.histogramdd(X, bins=3, weights=v)
|
||||
|
||||
assert_array_almost_equal(sum1, sum2)
|
||||
assert_array_almost_equal(edges1, edges2)
|
||||
|
||||
def test_dd_mean(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic_dd(X, v, 'mean', bins=3)
|
||||
stat2, edges2, bc = binned_statistic_dd(X, v, np.mean, bins=3)
|
||||
|
||||
assert_array_almost_equal(stat1, stat2)
|
||||
assert_array_almost_equal(edges1, edges2)
|
||||
|
||||
def test_dd_std(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic_dd(X, v, 'std', bins=3)
|
||||
stat2, edges2, bc = binned_statistic_dd(X, v, np.std, bins=3)
|
||||
|
||||
assert_array_almost_equal(stat1, stat2)
|
||||
assert_array_almost_equal(edges1, edges2)
|
||||
|
||||
def test_dd_median(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic_dd(X, v, 'median', bins=3)
|
||||
stat2, edges2, bc = binned_statistic_dd(X, v, np.median, bins=3)
|
||||
|
||||
assert_array_almost_equal(stat1, stat2)
|
||||
assert_array_almost_equal(edges1, edges2)
|
||||
|
||||
def test_dd_bincode(self):
|
||||
X = self.X[:20]
|
||||
v = self.v[:20]
|
||||
|
||||
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
|
||||
bc2 = np.array([63, 33, 86, 83, 88, 67, 57, 33, 42, 41, 82, 83, 92,
|
||||
32, 36, 91, 43, 87, 81, 81])
|
||||
|
||||
bcount = [(bc == i).sum() for i in np.unique(bc)]
|
||||
|
||||
assert_array_almost_equal(bc, bc2)
|
||||
count1adj = count1[count1.nonzero()]
|
||||
assert_array_almost_equal(bcount, count1adj)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_module_suite()
|
@ -0,0 +1,202 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import (run_module_suite, assert_equal, assert_array_equal,
|
||||
assert_array_almost_equal, assert_approx_equal, assert_raises,
|
||||
assert_allclose)
|
||||
from scipy.special import xlogy
|
||||
from scipy.stats.contingency import margins, expected_freq, chi2_contingency
|
||||
|
||||
|
||||
def test_margins():
|
||||
a = np.array([1])
|
||||
m = margins(a)
|
||||
assert_equal(len(m), 1)
|
||||
m0 = m[0]
|
||||
assert_array_equal(m0, np.array([1]))
|
||||
|
||||
a = np.array([[1]])
|
||||
m0, m1 = margins(a)
|
||||
expected0 = np.array([[1]])
|
||||
expected1 = np.array([[1]])
|
||||
assert_array_equal(m0, expected0)
|
||||
assert_array_equal(m1, expected1)
|
||||
|
||||
a = np.arange(12).reshape(2, 6)
|
||||
m0, m1 = margins(a)
|
||||
expected0 = np.array([[15], [51]])
|
||||
expected1 = np.array([[6, 8, 10, 12, 14, 16]])
|
||||
assert_array_equal(m0, expected0)
|
||||
assert_array_equal(m1, expected1)
|
||||
|
||||
a = np.arange(24).reshape(2, 3, 4)
|
||||
m0, m1, m2 = margins(a)
|
||||
expected0 = np.array([[[66]], [[210]]])
|
||||
expected1 = np.array([[[60], [92], [124]]])
|
||||
expected2 = np.array([[[60, 66, 72, 78]]])
|
||||
assert_array_equal(m0, expected0)
|
||||
assert_array_equal(m1, expected1)
|
||||
assert_array_equal(m2, expected2)
|
||||
|
||||
|
||||
def test_expected_freq():
|
||||
assert_array_equal(expected_freq([1]), np.array([1.0]))
|
||||
|
||||
observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
|
||||
e = expected_freq(observed)
|
||||
assert_array_equal(e, np.ones_like(observed))
|
||||
|
||||
observed = np.array([[10, 10, 20], [20, 20, 20]])
|
||||
e = expected_freq(observed)
|
||||
correct = np.array([[12., 12., 16.], [18., 18., 24.]])
|
||||
assert_array_almost_equal(e, correct)
|
||||
|
||||
|
||||
def test_chi2_contingency_trivial():
|
||||
# Some very simple tests for chi2_contingency.
|
||||
|
||||
# A trivial case
|
||||
obs = np.array([[1, 2], [1, 2]])
|
||||
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
|
||||
assert_equal(chi2, 0.0)
|
||||
assert_equal(p, 1.0)
|
||||
assert_equal(dof, 1)
|
||||
assert_array_equal(obs, expected)
|
||||
|
||||
# A *really* trivial case: 1-D data.
|
||||
obs = np.array([1, 2, 3])
|
||||
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
|
||||
assert_equal(chi2, 0.0)
|
||||
assert_equal(p, 1.0)
|
||||
assert_equal(dof, 0)
|
||||
assert_array_equal(obs, expected)
|
||||
|
||||
|
||||
def test_chi2_contingency_R():
|
||||
# Some test cases that were computed independently, using R.
|
||||
|
||||
Rcode = \
|
||||
"""
|
||||
# Data vector.
|
||||
data <- c(
|
||||
12, 34, 23, 4, 47, 11,
|
||||
35, 31, 11, 34, 10, 18,
|
||||
12, 32, 9, 18, 13, 19,
|
||||
12, 12, 14, 9, 33, 25
|
||||
)
|
||||
|
||||
# Create factor tags:r=rows, c=columns, t=tiers
|
||||
r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
|
||||
c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3")))
|
||||
t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2")))
|
||||
|
||||
# 3-way Chi squared test of independence
|
||||
s = summary(xtabs(data~r+c+t))
|
||||
print(s)
|
||||
"""
|
||||
Routput = \
|
||||
"""
|
||||
Call: xtabs(formula = data ~ r + c + t)
|
||||
Number of cases in table: 478
|
||||
Number of factors: 3
|
||||
Test for independence of all factors:
|
||||
Chisq = 102.17, df = 17, p-value = 3.514e-14
|
||||
"""
|
||||
obs = np.array(
|
||||
[[[12, 34, 23],
|
||||
[35, 31, 11],
|
||||
[12, 32, 9],
|
||||
[12, 12, 14]],
|
||||
[[4, 47, 11],
|
||||
[34, 10, 18],
|
||||
[18, 13, 19],
|
||||
[9, 33, 25]]])
|
||||
chi2, p, dof, expected = chi2_contingency(obs)
|
||||
assert_approx_equal(chi2, 102.17, significant=5)
|
||||
assert_approx_equal(p, 3.514e-14, significant=4)
|
||||
assert_equal(dof, 17)
|
||||
|
||||
Rcode = \
|
||||
"""
|
||||
# Data vector.
|
||||
data <- c(
|
||||
#
|
||||
12, 17,
|
||||
11, 16,
|
||||
#
|
||||
11, 12,
|
||||
15, 16,
|
||||
#
|
||||
23, 15,
|
||||
30, 22,
|
||||
#
|
||||
14, 17,
|
||||
15, 16
|
||||
)
|
||||
|
||||
# Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
|
||||
r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
|
||||
c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
|
||||
d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
|
||||
t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
|
||||
|
||||
# 4-way Chi squared test of independence
|
||||
s = summary(xtabs(data~r+c+d+t))
|
||||
print(s)
|
||||
"""
|
||||
Routput = \
|
||||
"""
|
||||
Call: xtabs(formula = data ~ r + c + d + t)
|
||||
Number of cases in table: 262
|
||||
Number of factors: 4
|
||||
Test for independence of all factors:
|
||||
Chisq = 8.758, df = 11, p-value = 0.6442
|
||||
"""
|
||||
obs = np.array(
|
||||
[[[[12, 17],
|
||||
[11, 16]],
|
||||
[[11, 12],
|
||||
[15, 16]]],
|
||||
[[[23, 15],
|
||||
[30, 22]],
|
||||
[[14, 17],
|
||||
[15, 16]]]])
|
||||
chi2, p, dof, expected = chi2_contingency(obs)
|
||||
assert_approx_equal(chi2, 8.758, significant=4)
|
||||
assert_approx_equal(p, 0.6442, significant=4)
|
||||
assert_equal(dof, 11)
|
||||
|
||||
|
||||
def test_chi2_contingency_g():
|
||||
c = np.array([[15, 60], [15, 90]])
|
||||
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=False)
|
||||
assert_allclose(g, 2*xlogy(c, c/e).sum())
|
||||
|
||||
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=True)
|
||||
c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]])
|
||||
assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum())
|
||||
|
||||
c = np.array([[10, 12, 10], [12, 10, 10]])
|
||||
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood')
|
||||
assert_allclose(g, 2*xlogy(c, c/e).sum())
|
||||
|
||||
|
||||
def test_chi2_contingency_bad_args():
|
||||
# Test that "bad" inputs raise a ValueError.
|
||||
|
||||
# Negative value in the array of observed frequencies.
|
||||
obs = np.array([[-1, 10], [1, 2]])
|
||||
assert_raises(ValueError, chi2_contingency, obs)
|
||||
|
||||
# The zeros in this will result in zeros in the array
|
||||
# of expected frequencies.
|
||||
obs = np.array([[0, 1], [0, 1]])
|
||||
assert_raises(ValueError, chi2_contingency, obs)
|
||||
|
||||
# A degenerate case: `observed` has size 0.
|
||||
obs = np.empty((0, 8))
|
||||
assert_raises(ValueError, chi2_contingency, obs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_module_suite()
|
File diff suppressed because it is too large
Load Diff
@ -1,98 +1,100 @@
|
||||
# NOTE: contains only one test, _est_cont_fit, that is renamed so that
|
||||
# nose doesn't run it
|
||||
# I put this here for the record and for the case when someone wants to
|
||||
# verify the quality of fit
|
||||
# with current parameters: relatively small sample size, default starting values
|
||||
# Ran 84 tests in 401.797s
|
||||
# FAILED (failures=15)
|
||||
#
|
||||
#Ran 83 tests in 238.859s
|
||||
#FAILED (failures=12)
|
||||
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy.testing as npt
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import dec
|
||||
|
||||
from wafo import stats
|
||||
|
||||
from test_continuous_basic import distcont
|
||||
from .test_continuous_basic import distcont
|
||||
|
||||
# this is not a proper statistical test for convergence, but only
|
||||
# verifies that the estimate and true values don't differ by too much
|
||||
n_repl1 = 1000 # sample size for first run
|
||||
n_repl2 = 5000 # sample size for second run, if first run fails
|
||||
thresh_percent = 0.25 # percent of true parameters for fail cut-off
|
||||
thresh_min = 0.75 # minimum difference estimate - true to fail test
|
||||
|
||||
fit_sizes = [1000, 5000] # sample sizes to try
|
||||
thresh_percent = 0.25 # percent of true parameters for fail cut-off
|
||||
thresh_min = 0.75 # minimum difference estimate - true to fail test
|
||||
|
||||
distslow = [ 'ncx2', 'rdist', 'gausshyper', 'recipinvgauss', 'ksone', 'genexpon',
|
||||
'vonmises', 'rice', 'mielke',
|
||||
'powerlognorm', 'kstwobign', 'tukeylambda','betaprime', 'gengamma',
|
||||
'johnsonsb', 'burr', 'truncexpon', 'pearson3', 'exponweib', 'nakagami',
|
||||
'wrapcauchy']
|
||||
dist_rarely_fitted = ['f', 'ncf', 'nct', 'chi']
|
||||
distskip = distslow + dist_rarely_fitted
|
||||
|
||||
#distcont = [['genextreme', (3.3184017469423535,)]]
|
||||
#@npt.dec.slow
|
||||
failing_fits = [
|
||||
'burr',
|
||||
'chi',
|
||||
'chi2',
|
||||
'gausshyper',
|
||||
'genexpon',
|
||||
'gengamma',
|
||||
'ksone',
|
||||
'mielke',
|
||||
'ncf',
|
||||
'ncx2',
|
||||
'pearson3',
|
||||
'powerlognorm',
|
||||
'truncexpon',
|
||||
'tukeylambda',
|
||||
'vonmises',
|
||||
'wrapcauchy',
|
||||
]
|
||||
|
||||
# Don't run the fit test on these:
|
||||
skip_fit = [
|
||||
'erlang', # Subclass of gamma, generates a warning.
|
||||
]
|
||||
|
||||
|
||||
@dec.slow
|
||||
def test_cont_fit():
|
||||
# this tests the closeness of the estimated parameters to the true
|
||||
# parameters with fit method of continuous distributions
|
||||
for distname, arg in distcont:
|
||||
if distname not in distskip:
|
||||
yield check_cont_fit, distname,arg
|
||||
|
||||
@npt.dec.slow
|
||||
def _est_cont_fit_slow():
|
||||
# this tests the closeness of the estimated parameters to the true
|
||||
# parameters with fit method of continuous distributions
|
||||
# Note: is slow, some distributions don't converge with sample size <= 10000
|
||||
|
||||
for distname, arg in distcont:
|
||||
if distname in distslow:
|
||||
if distname not in skip_fit:
|
||||
yield check_cont_fit, distname,arg
|
||||
|
||||
def test_lognorm_fit_ticket1131():
|
||||
params = [(2.1, 1.,1.), (1.,10.,1.), (1.,1.,10.)]
|
||||
for param in params:
|
||||
yield check_cont_fit, 'lognorm', param
|
||||
|
||||
def check_cont_fit(distname,arg):
|
||||
distfn = getattr(stats, distname)
|
||||
rvs = distfn.rvs(size=n_repl1,*arg)
|
||||
est = distfn.fit(rvs) #, *arg) # start with default values
|
||||
n = distfn.numargs + 2
|
||||
truearg = np.hstack([arg,[0.0, 1.0]])[:n]
|
||||
if distname in failing_fits:
|
||||
# Skip failing fits unless overridden
|
||||
xfail = True
|
||||
try:
|
||||
xfail = not int(os.environ['SCIPY_XFAIL'])
|
||||
except:
|
||||
pass
|
||||
if xfail:
|
||||
msg = "Fitting %s doesn't work reliably yet" % distname
|
||||
msg += " [Set environment variable SCIPY_XFAIL=1 to run this test nevertheless.]"
|
||||
dec.knownfailureif(True, msg)(lambda: None)()
|
||||
|
||||
diff = est-truearg
|
||||
distfn = getattr(stats, distname)
|
||||
|
||||
txt = ''
|
||||
truearg = np.hstack([arg,[0.0,1.0]])
|
||||
diffthreshold = np.max(np.vstack([truearg*thresh_percent,
|
||||
np.ones(distfn.numargs+2)*thresh_min]),0)
|
||||
# threshold for location
|
||||
diffthreshold[-2] = np.max([np.abs(rvs.mean())*thresh_percent,thresh_min])
|
||||
np.ones(distfn.numargs+2)*thresh_min]),0)
|
||||
|
||||
for fit_size in fit_sizes:
|
||||
# Note that if a fit succeeds, the other fit_sizes are skipped
|
||||
np.random.seed(1234)
|
||||
|
||||
with np.errstate(all='ignore'):
|
||||
rvs = distfn.rvs(size=fit_size, *arg)
|
||||
est = distfn.fit(rvs) # start with default values
|
||||
|
||||
diff = est - truearg
|
||||
|
||||
# threshold for location
|
||||
diffthreshold[-2] = np.max([np.abs(rvs.mean())*thresh_percent,thresh_min])
|
||||
|
||||
if np.any(np.isnan(est)):
|
||||
raise AssertionError('nan returned in fit')
|
||||
if np.any(np.isnan(est)):
|
||||
raise AssertionError('nan returned in fit')
|
||||
else:
|
||||
if np.all(np.abs(diff) <= diffthreshold):
|
||||
break
|
||||
else:
|
||||
if np.any((np.abs(diff) - diffthreshold) > 0.0):
|
||||
## txt = 'WARNING - diff too large with small sample'
|
||||
## print 'parameter diff =', diff - diffthreshold, txt
|
||||
rvs = np.concatenate([rvs,distfn.rvs(size=n_repl2-n_repl1,*arg)])
|
||||
est = distfn.fit(rvs) #,*arg)
|
||||
truearg = np.hstack([arg,[0.0,1.0]])[:n]
|
||||
diff = est-truearg
|
||||
if np.any((np.abs(diff) - diffthreshold) > 0.0):
|
||||
txt = 'parameter: %s\n' % str(truearg)
|
||||
txt += 'estimated: %s\n' % str(est)
|
||||
txt += 'diff : %s\n' % str(diff)
|
||||
raise AssertionError('fit not very good in %s\n' % distfn.name + txt)
|
||||
txt = 'parameter: %s\n' % str(truearg)
|
||||
txt += 'estimated: %s\n' % str(est)
|
||||
txt += 'diff : %s\n' % str(diff)
|
||||
raise AssertionError('fit not very good in %s\n' % distfn.name + txt)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
check_cont_fit('bradford', (0.29891359763170633,))
|
||||
# check_cont_fit('lognorm', (10,1,1))
|
||||
# check_cont_fit('ncx2', (21, 1.0560465975116415))
|
||||
import nose
|
||||
#nose.run(argv=['', __file__])
|
||||
nose.runmodule(argv=[__file__,'-s'], exit=False)
|
||||
np.testing.run_module_suite()
|
||||
|
@ -0,0 +1,789 @@
|
||||
# Author: Travis Oliphant, 2002
|
||||
#
|
||||
# Further enhancements and tests added by numerous SciPy developers.
|
||||
#
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
from numpy.random import RandomState
|
||||
from numpy.testing import (TestCase, run_module_suite, assert_array_equal,
|
||||
assert_almost_equal, assert_array_less, assert_array_almost_equal,
|
||||
assert_raises, assert_, assert_allclose, assert_equal, dec)
|
||||
|
||||
from scipy import stats
|
||||
|
||||
# Matplotlib is not a scipy dependency but is optionally used in probplot, so
|
||||
# check if it's available
|
||||
try:
|
||||
import matplotlib.pyplot as plt
|
||||
have_matplotlib = True
|
||||
except:
|
||||
have_matplotlib = False
|
||||
|
||||
|
||||
g1 = [1.006, 0.996, 0.998, 1.000, 0.992, 0.993, 1.002, 0.999, 0.994, 1.000]
|
||||
g2 = [0.998, 1.006, 1.000, 1.002, 0.997, 0.998, 0.996, 1.000, 1.006, 0.988]
|
||||
g3 = [0.991, 0.987, 0.997, 0.999, 0.995, 0.994, 1.000, 0.999, 0.996, 0.996]
|
||||
g4 = [1.005, 1.002, 0.994, 1.000, 0.995, 0.994, 0.998, 0.996, 1.002, 0.996]
|
||||
g5 = [0.998, 0.998, 0.982, 0.990, 1.002, 0.984, 0.996, 0.993, 0.980, 0.996]
|
||||
g6 = [1.009, 1.013, 1.009, 0.997, 0.988, 1.002, 0.995, 0.998, 0.981, 0.996]
|
||||
g7 = [0.990, 1.004, 0.996, 1.001, 0.998, 1.000, 1.018, 1.010, 0.996, 1.002]
|
||||
g8 = [0.998, 1.000, 1.006, 1.000, 1.002, 0.996, 0.998, 0.996, 1.002, 1.006]
|
||||
g9 = [1.002, 0.998, 0.996, 0.995, 0.996, 1.004, 1.004, 0.998, 0.999, 0.991]
|
||||
g10 = [0.991, 0.995, 0.984, 0.994, 0.997, 0.997, 0.991, 0.998, 1.004, 0.997]
|
||||
|
||||
|
||||
class TestShapiro(TestCase):
|
||||
def test_basic(self):
|
||||
x1 = [0.11,7.87,4.61,10.14,7.95,3.14,0.46,
|
||||
4.43,0.21,4.75,0.71,1.52,3.24,
|
||||
0.93,0.42,4.97,9.53,4.55,0.47,6.66]
|
||||
w,pw = stats.shapiro(x1)
|
||||
assert_almost_equal(w,0.90047299861907959,6)
|
||||
assert_almost_equal(pw,0.042089745402336121,6)
|
||||
x2 = [1.36,1.14,2.92,2.55,1.46,1.06,5.27,-1.11,
|
||||
3.48,1.10,0.88,-0.51,1.46,0.52,6.20,1.69,
|
||||
0.08,3.67,2.81,3.49]
|
||||
w,pw = stats.shapiro(x2)
|
||||
assert_almost_equal(w,0.9590270,6)
|
||||
assert_almost_equal(pw,0.52460,3)
|
||||
|
||||
def test_bad_arg(self):
|
||||
# Length of x is less than 3.
|
||||
x = [1]
|
||||
assert_raises(ValueError, stats.shapiro, x)
|
||||
|
||||
|
||||
class TestAnderson(TestCase):
|
||||
def test_normal(self):
|
||||
rs = RandomState(1234567890)
|
||||
x1 = rs.standard_exponential(size=50)
|
||||
x2 = rs.standard_normal(size=50)
|
||||
A,crit,sig = stats.anderson(x1)
|
||||
assert_array_less(crit[:-1], A)
|
||||
A,crit,sig = stats.anderson(x2)
|
||||
assert_array_less(A, crit[-2:])
|
||||
|
||||
def test_expon(self):
|
||||
rs = RandomState(1234567890)
|
||||
x1 = rs.standard_exponential(size=50)
|
||||
x2 = rs.standard_normal(size=50)
|
||||
A,crit,sig = stats.anderson(x1,'expon')
|
||||
assert_array_less(A, crit[-2:])
|
||||
olderr = np.seterr(all='ignore')
|
||||
try:
|
||||
A,crit,sig = stats.anderson(x2,'expon')
|
||||
finally:
|
||||
np.seterr(**olderr)
|
||||
assert_(A > crit[-1])
|
||||
|
||||
def test_bad_arg(self):
|
||||
assert_raises(ValueError, stats.anderson, [1], dist='plate_of_shrimp')
|
||||
|
||||
|
||||
class TestAnsari(TestCase):
|
||||
|
||||
def test_small(self):
|
||||
x = [1,2,3,3,4]
|
||||
y = [3,2,6,1,6,1,4,1]
|
||||
W, pval = stats.ansari(x,y)
|
||||
assert_almost_equal(W,23.5,11)
|
||||
assert_almost_equal(pval,0.13499256881897437,11)
|
||||
|
||||
def test_approx(self):
|
||||
ramsay = np.array((111, 107, 100, 99, 102, 106, 109, 108, 104, 99,
|
||||
101, 96, 97, 102, 107, 113, 116, 113, 110, 98))
|
||||
parekh = np.array((107, 108, 106, 98, 105, 103, 110, 105, 104,
|
||||
100, 96, 108, 103, 104, 114, 114, 113, 108, 106, 99))
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings('ignore',
|
||||
message="Ties preclude use of exact statistic.")
|
||||
W, pval = stats.ansari(ramsay, parekh)
|
||||
|
||||
assert_almost_equal(W,185.5,11)
|
||||
assert_almost_equal(pval,0.18145819972867083,11)
|
||||
|
||||
def test_exact(self):
|
||||
W,pval = stats.ansari([1,2,3,4],[15,5,20,8,10,12])
|
||||
assert_almost_equal(W,10.0,11)
|
||||
assert_almost_equal(pval,0.533333333333333333,7)
|
||||
|
||||
def test_bad_arg(self):
|
||||
assert_raises(ValueError, stats.ansari, [], [1])
|
||||
assert_raises(ValueError, stats.ansari, [1], [])
|
||||
|
||||
|
||||
class TestBartlett(TestCase):
|
||||
|
||||
def test_data(self):
|
||||
args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10]
|
||||
T, pval = stats.bartlett(*args)
|
||||
assert_almost_equal(T,20.78587342806484,7)
|
||||
assert_almost_equal(pval,0.0136358632781,7)
|
||||
|
||||
def test_bad_arg(self):
|
||||
# Too few args raises ValueError.
|
||||
assert_raises(ValueError, stats.bartlett, [1])
|
||||
|
||||
|
||||
class TestLevene(TestCase):
|
||||
|
||||
def test_data(self):
|
||||
args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10]
|
||||
W, pval = stats.levene(*args)
|
||||
assert_almost_equal(W,1.7059176930008939,7)
|
||||
assert_almost_equal(pval,0.0990829755522,7)
|
||||
|
||||
def test_trimmed1(self):
|
||||
# Test that center='trimmed' gives the same result as center='mean'
|
||||
# when proportiontocut=0.
|
||||
W1, pval1 = stats.levene(g1, g2, g3, center='mean')
|
||||
W2, pval2 = stats.levene(g1, g2, g3, center='trimmed', proportiontocut=0.0)
|
||||
assert_almost_equal(W1, W2)
|
||||
assert_almost_equal(pval1, pval2)
|
||||
|
||||
def test_trimmed2(self):
|
||||
x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0]
|
||||
y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0]
|
||||
np.random.seed(1234)
|
||||
x2 = np.random.permutation(x)
|
||||
|
||||
# Use center='trimmed'
|
||||
W0, pval0 = stats.levene(x, y, center='trimmed', proportiontocut=0.125)
|
||||
W1, pval1 = stats.levene(x2, y, center='trimmed', proportiontocut=0.125)
|
||||
# Trim the data here, and use center='mean'
|
||||
W2, pval2 = stats.levene(x[1:-1], y[1:-1], center='mean')
|
||||
# Result should be the same.
|
||||
assert_almost_equal(W0, W2)
|
||||
assert_almost_equal(W1, W2)
|
||||
assert_almost_equal(pval1, pval2)
|
||||
|
||||
def test_equal_mean_median(self):
|
||||
x = np.linspace(-1,1,21)
|
||||
np.random.seed(1234)
|
||||
x2 = np.random.permutation(x)
|
||||
y = x**3
|
||||
W1, pval1 = stats.levene(x, y, center='mean')
|
||||
W2, pval2 = stats.levene(x2, y, center='median')
|
||||
assert_almost_equal(W1, W2)
|
||||
assert_almost_equal(pval1, pval2)
|
||||
|
||||
def test_bad_keyword(self):
|
||||
x = np.linspace(-1,1,21)
|
||||
assert_raises(TypeError, stats.levene, x, x, portiontocut=0.1)
|
||||
|
||||
def test_bad_center_value(self):
|
||||
x = np.linspace(-1,1,21)
|
||||
assert_raises(ValueError, stats.levene, x, x, center='trim')
|
||||
|
||||
def test_too_few_args(self):
|
||||
assert_raises(ValueError, stats.levene, [1])
|
||||
|
||||
|
||||
class TestBinomP(TestCase):
|
||||
|
||||
def test_data(self):
|
||||
pval = stats.binom_test(100,250)
|
||||
assert_almost_equal(pval,0.0018833009350757682,11)
|
||||
pval = stats.binom_test(201,405)
|
||||
assert_almost_equal(pval,0.92085205962670713,11)
|
||||
pval = stats.binom_test([682,243],p=3.0/4)
|
||||
assert_almost_equal(pval,0.38249155957481695,11)
|
||||
|
||||
def test_bad_len_x(self):
|
||||
# Length of x must be 1 or 2.
|
||||
assert_raises(ValueError, stats.binom_test, [1,2,3])
|
||||
|
||||
def test_bad_n(self):
|
||||
# len(x) is 1, but n is invalid.
|
||||
# Missing n
|
||||
assert_raises(ValueError, stats.binom_test, [100])
|
||||
# n less than x[0]
|
||||
assert_raises(ValueError, stats.binom_test, [100], n=50)
|
||||
|
||||
def test_bad_p(self):
|
||||
assert_raises(ValueError, stats.binom_test, [50, 50], p=2.0)
|
||||
|
||||
|
||||
class TestFindRepeats(TestCase):
|
||||
|
||||
def test_basic(self):
|
||||
a = [1,2,3,4,1,2,3,4,1,2,5]
|
||||
res,nums = stats.find_repeats(a)
|
||||
assert_array_equal(res,[1,2,3,4])
|
||||
assert_array_equal(nums,[3,3,2,2])
|
||||
|
||||
def test_empty_result(self):
|
||||
# Check that empty arrays are returned when there are no repeats.
|
||||
a = [10, 20, 50, 30, 40]
|
||||
repeated, counts = stats.find_repeats(a)
|
||||
assert_array_equal(repeated, [])
|
||||
assert_array_equal(counts, [])
|
||||
|
||||
|
||||
class TestFligner(TestCase):
|
||||
|
||||
def test_data(self):
|
||||
# numbers from R: fligner.test in package stats
|
||||
x1 = np.arange(5)
|
||||
assert_array_almost_equal(stats.fligner(x1,x1**2),
|
||||
(3.2282229927203536, 0.072379187848207877), 11)
|
||||
|
||||
def test_trimmed1(self):
|
||||
# Test that center='trimmed' gives the same result as center='mean'
|
||||
# when proportiontocut=0.
|
||||
Xsq1, pval1 = stats.fligner(g1, g2, g3, center='mean')
|
||||
Xsq2, pval2 = stats.fligner(g1, g2, g3, center='trimmed', proportiontocut=0.0)
|
||||
assert_almost_equal(Xsq1, Xsq2)
|
||||
assert_almost_equal(pval1, pval2)
|
||||
|
||||
def test_trimmed2(self):
|
||||
x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0]
|
||||
y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0]
|
||||
# Use center='trimmed'
|
||||
Xsq1, pval1 = stats.fligner(x, y, center='trimmed', proportiontocut=0.125)
|
||||
# Trim the data here, and use center='mean'
|
||||
Xsq2, pval2 = stats.fligner(x[1:-1], y[1:-1], center='mean')
|
||||
# Result should be the same.
|
||||
assert_almost_equal(Xsq1, Xsq2)
|
||||
assert_almost_equal(pval1, pval2)
|
||||
|
||||
# The following test looks reasonable at first, but fligner() uses the
|
||||
# function stats.rankdata(), and in one of the cases in this test,
|
||||
# there are ties, while in the other (because of normal rounding
|
||||
# errors) there are not. This difference leads to differences in the
|
||||
# third significant digit of W.
|
||||
#
|
||||
#def test_equal_mean_median(self):
|
||||
# x = np.linspace(-1,1,21)
|
||||
# y = x**3
|
||||
# W1, pval1 = stats.fligner(x, y, center='mean')
|
||||
# W2, pval2 = stats.fligner(x, y, center='median')
|
||||
# assert_almost_equal(W1, W2)
|
||||
# assert_almost_equal(pval1, pval2)
|
||||
|
||||
def test_bad_keyword(self):
|
||||
x = np.linspace(-1,1,21)
|
||||
assert_raises(TypeError, stats.fligner, x, x, portiontocut=0.1)
|
||||
|
||||
def test_bad_center_value(self):
|
||||
x = np.linspace(-1,1,21)
|
||||
assert_raises(ValueError, stats.fligner, x, x, center='trim')
|
||||
|
||||
def test_bad_num_args(self):
|
||||
# Too few args raises ValueError.
|
||||
assert_raises(ValueError, stats.fligner, [1])
|
||||
|
||||
|
||||
class TestMood(TestCase):
|
||||
def test_mood(self):
|
||||
# numbers from R: mood.test in package stats
|
||||
x1 = np.arange(5)
|
||||
assert_array_almost_equal(stats.mood(x1, x1**2),
|
||||
(-1.3830857299399906, 0.16663858066771478), 11)
|
||||
|
||||
def test_mood_order_of_args(self):
|
||||
# z should change sign when the order of arguments changes, pvalue
|
||||
# should not change
|
||||
np.random.seed(1234)
|
||||
x1 = np.random.randn(10, 1)
|
||||
x2 = np.random.randn(15, 1)
|
||||
z1, p1 = stats.mood(x1, x2)
|
||||
z2, p2 = stats.mood(x2, x1)
|
||||
assert_array_almost_equal([z1, p1], [-z2, p2])
|
||||
|
||||
def test_mood_with_axis_none(self):
|
||||
#Test with axis = None, compare with results from R
|
||||
x1 = [-0.626453810742332, 0.183643324222082, -0.835628612410047,
|
||||
1.59528080213779, 0.329507771815361, -0.820468384118015,
|
||||
0.487429052428485, 0.738324705129217, 0.575781351653492,
|
||||
-0.305388387156356, 1.51178116845085, 0.389843236411431,
|
||||
-0.621240580541804, -2.2146998871775, 1.12493091814311,
|
||||
-0.0449336090152309, -0.0161902630989461, 0.943836210685299,
|
||||
0.821221195098089, 0.593901321217509]
|
||||
|
||||
x2 = [-0.896914546624981, 0.184849184646742, 1.58784533120882,
|
||||
-1.13037567424629, -0.0802517565509893, 0.132420284381094,
|
||||
0.707954729271733, -0.23969802417184, 1.98447393665293,
|
||||
-0.138787012119665, 0.417650750792556, 0.981752777463662,
|
||||
-0.392695355503813, -1.03966897694891, 1.78222896030858,
|
||||
-2.31106908460517, 0.878604580921265, 0.035806718015226,
|
||||
1.01282869212708, 0.432265154539617, 2.09081920524915,
|
||||
-1.19992581964387, 1.58963820029007, 1.95465164222325,
|
||||
0.00493777682814261, -2.45170638784613, 0.477237302613617,
|
||||
-0.596558168631403, 0.792203270299649, 0.289636710177348]
|
||||
|
||||
x1 = np.array(x1)
|
||||
x2 = np.array(x2)
|
||||
x1.shape = (10, 2)
|
||||
x2.shape = (15, 2)
|
||||
assert_array_almost_equal(stats.mood(x1, x2, axis=None),
|
||||
[-1.31716607555, 0.18778296257])
|
||||
|
||||
def test_mood_2d(self):
|
||||
# Test if the results of mood test in 2-D case are consistent with the
|
||||
# R result for the same inputs. Numbers from R mood.test().
|
||||
ny = 5
|
||||
np.random.seed(1234)
|
||||
x1 = np.random.randn(10, ny)
|
||||
x2 = np.random.randn(15, ny)
|
||||
z_vectest, pval_vectest = stats.mood(x1, x2)
|
||||
|
||||
for j in range(ny):
|
||||
assert_array_almost_equal([z_vectest[j], pval_vectest[j]],
|
||||
stats.mood(x1[:, j], x2[:, j]))
|
||||
|
||||
# inverse order of dimensions
|
||||
x1 = x1.transpose()
|
||||
x2 = x2.transpose()
|
||||
z_vectest, pval_vectest = stats.mood(x1, x2, axis=1)
|
||||
|
||||
for i in range(ny):
|
||||
# check axis handling is self consistent
|
||||
assert_array_almost_equal([z_vectest[i], pval_vectest[i]],
|
||||
stats.mood(x1[i, :], x2[i, :]))
|
||||
|
||||
def test_mood_3d(self):
|
||||
shape = (10, 5, 6)
|
||||
np.random.seed(1234)
|
||||
x1 = np.random.randn(*shape)
|
||||
x2 = np.random.randn(*shape)
|
||||
|
||||
for axis in range(3):
|
||||
z_vectest, pval_vectest = stats.mood(x1, x2, axis=axis)
|
||||
# Tests that result for 3-D arrays is equal to that for the
|
||||
# same calculation on a set of 1-D arrays taken from the
|
||||
# 3-D array
|
||||
axes_idx = ([1, 2], [0, 2], [0, 1]) # the two axes != axis
|
||||
for i in range(shape[axes_idx[axis][0]]):
|
||||
for j in range(shape[axes_idx[axis][1]]):
|
||||
if axis == 0:
|
||||
slice1 = x1[:, i, j]
|
||||
slice2 = x2[:, i, j]
|
||||
elif axis == 1:
|
||||
slice1 = x1[i, :, j]
|
||||
slice2 = x2[i, :, j]
|
||||
else:
|
||||
slice1 = x1[i, j, :]
|
||||
slice2 = x2[i, j, :]
|
||||
|
||||
assert_array_almost_equal([z_vectest[i, j],
|
||||
pval_vectest[i, j]],
|
||||
stats.mood(slice1, slice2))
|
||||
|
||||
def test_mood_bad_arg(self):
|
||||
# Raise ValueError when the sum of the lengths of the args is less than 3
|
||||
assert_raises(ValueError, stats.mood, [1], [])
|
||||
|
||||
|
||||
class TestProbplot(TestCase):
|
||||
|
||||
def test_basic(self):
|
||||
np.random.seed(12345)
|
||||
x = stats.norm.rvs(size=20)
|
||||
osm, osr = stats.probplot(x, fit=False)
|
||||
osm_expected = [-1.8241636, -1.38768012, -1.11829229, -0.91222575,
|
||||
-0.73908135, -0.5857176, -0.44506467, -0.31273668,
|
||||
-0.18568928, -0.06158146, 0.06158146, 0.18568928,
|
||||
0.31273668, 0.44506467, 0.5857176, 0.73908135,
|
||||
0.91222575, 1.11829229, 1.38768012, 1.8241636]
|
||||
assert_allclose(osr, np.sort(x))
|
||||
assert_allclose(osm, osm_expected)
|
||||
|
||||
res, res_fit = stats.probplot(x, fit=True)
|
||||
res_fit_expected = [1.05361841, 0.31297795, 0.98741609]
|
||||
assert_allclose(res_fit, res_fit_expected)
|
||||
|
||||
def test_sparams_keyword(self):
|
||||
np.random.seed(123456)
|
||||
x = stats.norm.rvs(size=100)
|
||||
# Check that None, () and 0 (loc=0, for normal distribution) all work
|
||||
# and give the same results
|
||||
osm1, osr1 = stats.probplot(x, sparams=None, fit=False)
|
||||
osm2, osr2 = stats.probplot(x, sparams=0, fit=False)
|
||||
osm3, osr3 = stats.probplot(x, sparams=(), fit=False)
|
||||
assert_allclose(osm1, osm2)
|
||||
assert_allclose(osm1, osm3)
|
||||
assert_allclose(osr1, osr2)
|
||||
assert_allclose(osr1, osr3)
|
||||
# Check giving (loc, scale) params for normal distribution
|
||||
osm, osr = stats.probplot(x, sparams=(), fit=False)
|
||||
|
||||
def test_dist_keyword(self):
|
||||
np.random.seed(12345)
|
||||
x = stats.norm.rvs(size=20)
|
||||
osm1, osr1 = stats.probplot(x, fit=False, dist='t', sparams=(3,))
|
||||
osm2, osr2 = stats.probplot(x, fit=False, dist=stats.t, sparams=(3,))
|
||||
assert_allclose(osm1, osm2)
|
||||
assert_allclose(osr1, osr2)
|
||||
|
||||
assert_raises(ValueError, stats.probplot, x, dist='wrong-dist-name')
|
||||
assert_raises(AttributeError, stats.probplot, x, dist=[])
|
||||
|
||||
class custom_dist(object):
|
||||
"""Some class that looks just enough like a distribution."""
|
||||
def ppf(self, q):
|
||||
return stats.norm.ppf(q, loc=2)
|
||||
|
||||
osm1, osr1 = stats.probplot(x, sparams=(2,), fit=False)
|
||||
osm2, osr2 = stats.probplot(x, dist=custom_dist(), fit=False)
|
||||
assert_allclose(osm1, osm2)
|
||||
assert_allclose(osr1, osr2)
|
||||
|
||||
@dec.skipif(not have_matplotlib)
|
||||
def test_plot_kwarg(self):
|
||||
np.random.seed(7654321)
|
||||
fig = plt.figure()
|
||||
fig.add_subplot(111)
|
||||
x = stats.t.rvs(3, size=100)
|
||||
res1, fitres1 = stats.probplot(x, plot=plt)
|
||||
plt.close()
|
||||
res2, fitres2 = stats.probplot(x, plot=None)
|
||||
res3 = stats.probplot(x, fit=False, plot=plt)
|
||||
plt.close()
|
||||
res4 = stats.probplot(x, fit=False, plot=None)
|
||||
# Check that results are consistent between combinations of `fit` and
|
||||
# `plot` keywords.
|
||||
assert_(len(res1) == len(res2) == len(res3) == len(res4) == 2)
|
||||
assert_allclose(res1, res2)
|
||||
assert_allclose(res1, res3)
|
||||
assert_allclose(res1, res4)
|
||||
assert_allclose(fitres1, fitres2)
|
||||
|
||||
# Check that a Matplotlib Axes object is accepted
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111)
|
||||
stats.probplot(x, fit=False, plot=ax)
|
||||
plt.close()
|
||||
|
||||
def test_probplot_bad_args(self):
|
||||
# Raise ValueError when given an invalid distribution.
|
||||
assert_raises(ValueError, stats.probplot, [1], dist="plate_of_shrimp")
|
||||
|
||||
|
||||
def test_wilcoxon_bad_arg():
|
||||
# Raise ValueError when two args of different lengths are given or
|
||||
# zero_method is unknown.
|
||||
assert_raises(ValueError, stats.wilcoxon, [1], [1,2])
|
||||
assert_raises(ValueError, stats.wilcoxon, [1,2], [1,2], "dummy")
|
||||
|
||||
|
||||
def test_mvsdist_bad_arg():
|
||||
# Raise ValueError if fewer than two data points are given.
|
||||
data = [1]
|
||||
assert_raises(ValueError, stats.mvsdist, data)
|
||||
|
||||
|
||||
def test_kstat_bad_arg():
|
||||
# Raise ValueError if n > 4 or n > 1.
|
||||
data = [1]
|
||||
n = 10
|
||||
assert_raises(ValueError, stats.kstat, data, n=n)
|
||||
|
||||
|
||||
def test_kstatvar_bad_arg():
|
||||
# Raise ValueError is n is not 1 or 2.
|
||||
data = [1]
|
||||
n = 10
|
||||
assert_raises(ValueError, stats.kstatvar, data, n=n)
|
||||
|
||||
|
||||
def test_ppcc_max_bad_arg():
|
||||
# Raise ValueError when given an invalid distribution.
|
||||
data = [1]
|
||||
assert_raises(ValueError, stats.ppcc_max, data, dist="plate_of_shrimp")
|
||||
|
||||
|
||||
class TestBoxcox_llf(TestCase):
|
||||
|
||||
def test_basic(self):
|
||||
np.random.seed(54321)
|
||||
x = stats.norm.rvs(size=10000, loc=10)
|
||||
lmbda = 1
|
||||
llf = stats.boxcox_llf(lmbda, x)
|
||||
llf_expected = -x.size / 2. * np.log(np.sum(x.std()**2))
|
||||
assert_allclose(llf, llf_expected)
|
||||
|
||||
def test_array_like(self):
|
||||
np.random.seed(54321)
|
||||
x = stats.norm.rvs(size=100, loc=10)
|
||||
lmbda = 1
|
||||
llf = stats.boxcox_llf(lmbda, x)
|
||||
llf2 = stats.boxcox_llf(lmbda, list(x))
|
||||
assert_allclose(llf, llf2, rtol=1e-12)
|
||||
|
||||
def test_2d_input(self):
|
||||
# Note: boxcox_llf() was already working with 2-D input (sort of), so
|
||||
# keep it like that. boxcox() doesn't work with 2-D input though, due
|
||||
# to brent() returning a scalar.
|
||||
np.random.seed(54321)
|
||||
x = stats.norm.rvs(size=100, loc=10)
|
||||
lmbda = 1
|
||||
llf = stats.boxcox_llf(lmbda, x)
|
||||
llf2 = stats.boxcox_llf(lmbda, np.vstack([x, x]).T)
|
||||
assert_allclose([llf, llf], llf2, rtol=1e-12)
|
||||
|
||||
def test_empty(self):
|
||||
assert_(np.isnan(stats.boxcox_llf(1, [])))
|
||||
|
||||
|
||||
class TestBoxcox(TestCase):
|
||||
|
||||
def test_fixed_lmbda(self):
|
||||
np.random.seed(12345)
|
||||
x = stats.loggamma.rvs(5, size=50) + 5
|
||||
xt = stats.boxcox(x, lmbda=1)
|
||||
assert_allclose(xt, x - 1)
|
||||
xt = stats.boxcox(x, lmbda=-1)
|
||||
assert_allclose(xt, 1 - 1/x)
|
||||
|
||||
xt = stats.boxcox(x, lmbda=0)
|
||||
assert_allclose(xt, np.log(x))
|
||||
|
||||
# Also test that array_like input works
|
||||
xt = stats.boxcox(list(x), lmbda=0)
|
||||
assert_allclose(xt, np.log(x))
|
||||
|
||||
def test_lmbda_None(self):
|
||||
np.random.seed(1234567)
|
||||
# Start from normal rv's, do inverse transform to check that
|
||||
# optimization function gets close to the right answer.
|
||||
np.random.seed(1245)
|
||||
lmbda = 2.5
|
||||
x = stats.norm.rvs(loc=10, size=50000)
|
||||
x_inv = (x * lmbda + 1)**(-lmbda)
|
||||
xt, maxlog = stats.boxcox(x_inv)
|
||||
|
||||
assert_almost_equal(maxlog, -1 / lmbda, decimal=2)
|
||||
|
||||
def test_alpha(self):
|
||||
np.random.seed(1234)
|
||||
x = stats.loggamma.rvs(5, size=50) + 5
|
||||
|
||||
# Some regular values for alpha, on a small sample size
|
||||
_, _, interval = stats.boxcox(x, alpha=0.75)
|
||||
assert_allclose(interval, [4.004485780226041, 5.138756355035744])
|
||||
_, _, interval = stats.boxcox(x, alpha=0.05)
|
||||
assert_allclose(interval, [1.2138178554857557, 8.209033272375663])
|
||||
|
||||
# Try some extreme values, see we don't hit the N=500 limit
|
||||
x = stats.loggamma.rvs(7, size=500) + 15
|
||||
_, _, interval = stats.boxcox(x, alpha=0.001)
|
||||
assert_allclose(interval, [0.3988867, 11.40553131])
|
||||
_, _, interval = stats.boxcox(x, alpha=0.999)
|
||||
assert_allclose(interval, [5.83316246, 5.83735292])
|
||||
|
||||
def test_boxcox_bad_arg(self):
|
||||
# Raise ValueError if any data value is negative.
|
||||
x = np.array([-1])
|
||||
assert_raises(ValueError, stats.boxcox, x)
|
||||
|
||||
def test_empty(self):
|
||||
assert_(stats.boxcox([]).shape == (0,))
|
||||
|
||||
|
||||
class TestBoxcoxNormmax(TestCase):
|
||||
def setUp(self):
|
||||
np.random.seed(12345)
|
||||
self.x = stats.loggamma.rvs(5, size=50) + 5
|
||||
|
||||
def test_pearsonr(self):
|
||||
maxlog = stats.boxcox_normmax(self.x)
|
||||
assert_allclose(maxlog, 1.804465325046)
|
||||
|
||||
def test_mle(self):
|
||||
maxlog = stats.boxcox_normmax(self.x, method='mle')
|
||||
assert_allclose(maxlog, 1.758101454114)
|
||||
|
||||
# Check that boxcox() uses 'mle'
|
||||
_, maxlog_boxcox = stats.boxcox(self.x)
|
||||
assert_allclose(maxlog_boxcox, maxlog)
|
||||
|
||||
def test_all(self):
|
||||
maxlog_all = stats.boxcox_normmax(self.x, method='all')
|
||||
assert_allclose(maxlog_all, [1.804465325046, 1.758101454114])
|
||||
|
||||
|
||||
class TestBoxcoxNormplot(TestCase):
|
||||
def setUp(self):
|
||||
np.random.seed(7654321)
|
||||
self.x = stats.loggamma.rvs(5, size=500) + 5
|
||||
|
||||
def test_basic(self):
|
||||
N = 5
|
||||
lmbdas, ppcc = stats.boxcox_normplot(self.x, -10, 10, N=N)
|
||||
ppcc_expected = [0.57783375, 0.83610988, 0.97524311, 0.99756057,
|
||||
0.95843297]
|
||||
assert_allclose(lmbdas, np.linspace(-10, 10, num=N))
|
||||
assert_allclose(ppcc, ppcc_expected)
|
||||
|
||||
@dec.skipif(not have_matplotlib)
|
||||
def test_plot_kwarg(self):
|
||||
# Check with the matplotlib.pyplot module
|
||||
fig = plt.figure()
|
||||
fig.add_subplot(111)
|
||||
stats.boxcox_normplot(self.x, -20, 20, plot=plt)
|
||||
plt.close()
|
||||
|
||||
# Check that a Matplotlib Axes object is accepted
|
||||
fig.add_subplot(111)
|
||||
ax = fig.add_subplot(111)
|
||||
stats.boxcox_normplot(self.x, -20, 20, plot=ax)
|
||||
plt.close()
|
||||
|
||||
def test_invalid_inputs(self):
|
||||
# `lb` has to be larger than `la`
|
||||
assert_raises(ValueError, stats.boxcox_normplot, self.x, 1, 0)
|
||||
# `x` can not contain negative values
|
||||
assert_raises(ValueError, stats.boxcox_normplot, [-1, 1] , 0, 1)
|
||||
|
||||
def test_empty(self):
|
||||
assert_(stats.boxcox_normplot([], 0, 1).size == 0)
|
||||
|
||||
|
||||
class TestCircFuncs(TestCase):
|
||||
def test_circfuncs(self):
|
||||
x = np.array([355,5,2,359,10,350])
|
||||
M = stats.circmean(x, high=360)
|
||||
Mval = 0.167690146
|
||||
assert_allclose(M, Mval, rtol=1e-7)
|
||||
|
||||
V = stats.circvar(x, high=360)
|
||||
Vval = 42.51955609
|
||||
assert_allclose(V, Vval, rtol=1e-7)
|
||||
|
||||
S = stats.circstd(x, high=360)
|
||||
Sval = 6.520702116
|
||||
assert_allclose(S, Sval, rtol=1e-7)
|
||||
|
||||
def test_circfuncs_small(self):
|
||||
x = np.array([20,21,22,18,19,20.5,19.2])
|
||||
M1 = x.mean()
|
||||
M2 = stats.circmean(x, high=360)
|
||||
assert_allclose(M2, M1, rtol=1e-5)
|
||||
|
||||
V1 = x.var()
|
||||
V2 = stats.circvar(x, high=360)
|
||||
assert_allclose(V2, V1, rtol=1e-4)
|
||||
|
||||
S1 = x.std()
|
||||
S2 = stats.circstd(x, high=360)
|
||||
assert_allclose(S2, S1, rtol=1e-4)
|
||||
|
||||
def test_circmean_axis(self):
|
||||
x = np.array([[355,5,2,359,10,350],
|
||||
[351,7,4,352,9,349],
|
||||
[357,9,8,358,4,356]])
|
||||
M1 = stats.circmean(x, high=360)
|
||||
M2 = stats.circmean(x.ravel(), high=360)
|
||||
assert_allclose(M1, M2, rtol=1e-14)
|
||||
|
||||
M1 = stats.circmean(x, high=360, axis=1)
|
||||
M2 = [stats.circmean(x[i], high=360) for i in range(x.shape[0])]
|
||||
assert_allclose(M1, M2, rtol=1e-14)
|
||||
|
||||
M1 = stats.circmean(x, high=360, axis=0)
|
||||
M2 = [stats.circmean(x[:,i], high=360) for i in range(x.shape[1])]
|
||||
assert_allclose(M1, M2, rtol=1e-14)
|
||||
|
||||
def test_circvar_axis(self):
|
||||
x = np.array([[355,5,2,359,10,350],
|
||||
[351,7,4,352,9,349],
|
||||
[357,9,8,358,4,356]])
|
||||
|
||||
V1 = stats.circvar(x, high=360)
|
||||
V2 = stats.circvar(x.ravel(), high=360)
|
||||
assert_allclose(V1, V2, rtol=1e-11)
|
||||
|
||||
V1 = stats.circvar(x, high=360, axis=1)
|
||||
V2 = [stats.circvar(x[i], high=360) for i in range(x.shape[0])]
|
||||
assert_allclose(V1, V2, rtol=1e-11)
|
||||
|
||||
V1 = stats.circvar(x, high=360, axis=0)
|
||||
V2 = [stats.circvar(x[:,i], high=360) for i in range(x.shape[1])]
|
||||
assert_allclose(V1, V2, rtol=1e-11)
|
||||
|
||||
def test_circstd_axis(self):
|
||||
x = np.array([[355,5,2,359,10,350],
|
||||
[351,7,4,352,9,349],
|
||||
[357,9,8,358,4,356]])
|
||||
|
||||
S1 = stats.circstd(x, high=360)
|
||||
S2 = stats.circstd(x.ravel(), high=360)
|
||||
assert_allclose(S1, S2, rtol=1e-11)
|
||||
|
||||
S1 = stats.circstd(x, high=360, axis=1)
|
||||
S2 = [stats.circstd(x[i], high=360) for i in range(x.shape[0])]
|
||||
assert_allclose(S1, S2, rtol=1e-11)
|
||||
|
||||
S1 = stats.circstd(x, high=360, axis=0)
|
||||
S2 = [stats.circstd(x[:,i], high=360) for i in range(x.shape[1])]
|
||||
assert_allclose(S1, S2, rtol=1e-11)
|
||||
|
||||
def test_circfuncs_array_like(self):
|
||||
x = [355,5,2,359,10,350]
|
||||
assert_allclose(stats.circmean(x, high=360), 0.167690146, rtol=1e-7)
|
||||
assert_allclose(stats.circvar(x, high=360), 42.51955609, rtol=1e-7)
|
||||
assert_allclose(stats.circstd(x, high=360), 6.520702116, rtol=1e-7)
|
||||
|
||||
def test_empty(self):
|
||||
assert_(np.isnan(stats.circmean([])))
|
||||
assert_(np.isnan(stats.circstd([])))
|
||||
assert_(np.isnan(stats.circvar([])))
|
||||
|
||||
|
||||
def test_accuracy_wilcoxon():
|
||||
freq = [1, 4, 16, 15, 8, 4, 5, 1, 2]
|
||||
nums = range(-4, 5)
|
||||
x = np.concatenate([[u] * v for u, v in zip(nums, freq)])
|
||||
y = np.zeros(x.size)
|
||||
|
||||
T, p = stats.wilcoxon(x, y, "pratt")
|
||||
assert_allclose(T, 423)
|
||||
assert_allclose(p, 0.00197547303533107)
|
||||
|
||||
T, p = stats.wilcoxon(x, y, "zsplit")
|
||||
assert_allclose(T, 441)
|
||||
assert_allclose(p, 0.0032145343172473055)
|
||||
|
||||
T, p = stats.wilcoxon(x, y, "wilcox")
|
||||
assert_allclose(T, 327)
|
||||
assert_allclose(p, 0.00641346115861)
|
||||
|
||||
# Test the 'correction' option, using values computed in R with:
|
||||
# > wilcox.test(x, y, paired=TRUE, exact=FALSE, correct={FALSE,TRUE})
|
||||
x = np.array([120, 114, 181, 188, 180, 146, 121, 191, 132, 113, 127, 112])
|
||||
y = np.array([133, 143, 119, 189, 112, 199, 198, 113, 115, 121, 142, 187])
|
||||
T, p = stats.wilcoxon(x, y, correction=False)
|
||||
assert_equal(T, 34)
|
||||
assert_allclose(p, 0.6948866, rtol=1e-6)
|
||||
T, p = stats.wilcoxon(x, y, correction=True)
|
||||
assert_equal(T, 34)
|
||||
assert_allclose(p, 0.7240817, rtol=1e-6)
|
||||
|
||||
|
||||
def test_wilcoxon_tie():
|
||||
# Regression test for gh-2391.
|
||||
# Corresponding R code is:
|
||||
# > result = wilcox.test(rep(0.1, 10), exact=FALSE, correct=FALSE)
|
||||
# > result$p.value
|
||||
# [1] 0.001565402
|
||||
# > result = wilcox.test(rep(0.1, 10), exact=FALSE, correct=TRUE)
|
||||
# > result$p.value
|
||||
# [1] 0.001904195
|
||||
stat, p = stats.wilcoxon([0.1] * 10)
|
||||
expected_p = 0.001565402
|
||||
assert_equal(stat, 0)
|
||||
assert_allclose(p, expected_p, rtol=1e-6)
|
||||
|
||||
stat, p = stats.wilcoxon([0.1] * 10, correction=True)
|
||||
expected_p = 0.001904195
|
||||
assert_equal(stat, 0)
|
||||
assert_allclose(p, expected_p, rtol=1e-6)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_module_suite()
|
@ -0,0 +1,275 @@
|
||||
"""
|
||||
Test functions for multivariate normal distributions.
|
||||
|
||||
"""
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from numpy.testing import (assert_almost_equal,
|
||||
run_module_suite, assert_allclose, assert_equal, assert_raises)
|
||||
|
||||
import numpy
|
||||
import numpy as np
|
||||
|
||||
import scipy.linalg
|
||||
import scipy.stats._multivariate
|
||||
from scipy.stats import multivariate_normal
|
||||
from scipy.stats import norm
|
||||
|
||||
from scipy.stats._multivariate import _psd_pinv_decomposed_log_pdet
|
||||
|
||||
from scipy.integrate import romb
|
||||
|
||||
|
||||
def test_scalar_values():
|
||||
np.random.seed(1234)
|
||||
|
||||
# When evaluated on scalar data, the pdf should return a scalar
|
||||
x, mean, cov = 1.5, 1.7, 2.5
|
||||
pdf = multivariate_normal.pdf(x, mean, cov)
|
||||
assert_equal(pdf.ndim, 0)
|
||||
|
||||
# When evaluated on a single vector, the pdf should return a scalar
|
||||
x = np.random.randn(5)
|
||||
mean = np.random.randn(5)
|
||||
cov = np.abs(np.random.randn(5)) # Diagonal values for cov. matrix
|
||||
pdf = multivariate_normal.pdf(x, mean, cov)
|
||||
assert_equal(pdf.ndim, 0)
|
||||
|
||||
|
||||
def test_logpdf():
|
||||
# Check that the log of the pdf is in fact the logpdf
|
||||
np.random.seed(1234)
|
||||
x = np.random.randn(5)
|
||||
mean = np.random.randn(5)
|
||||
cov = np.abs(np.random.randn(5))
|
||||
d1 = multivariate_normal.logpdf(x, mean, cov)
|
||||
d2 = multivariate_normal.pdf(x, mean, cov)
|
||||
assert_allclose(d1, np.log(d2))
|
||||
|
||||
|
||||
def test_large_pseudo_determinant():
|
||||
# Check that large pseudo-determinants are handled appropriately.
|
||||
|
||||
# Construct a singular diagonal covariance matrix
|
||||
# whose pseudo determinant overflows double precision.
|
||||
large_total_log = 1000.0
|
||||
npos = 100
|
||||
nzero = 2
|
||||
large_entry = np.exp(large_total_log / npos)
|
||||
n = npos + nzero
|
||||
cov = np.zeros((n, n), dtype=float)
|
||||
np.fill_diagonal(cov, large_entry)
|
||||
cov[-nzero:, -nzero:] = 0
|
||||
|
||||
# Check some determinants.
|
||||
assert_equal(scipy.linalg.det(cov), 0)
|
||||
assert_equal(scipy.linalg.det(cov[:npos, :npos]), np.inf)
|
||||
|
||||
# np.linalg.slogdet is only available in numpy 1.6+
|
||||
# but scipy currently supports numpy 1.5.1.
|
||||
#assert_allclose(np.linalg.slogdet(cov[:npos, :npos]), (1, large_total_log))
|
||||
|
||||
# Check the pseudo-determinant.
|
||||
U, log_pdet = scipy.stats._multivariate._psd_pinv_decomposed_log_pdet(cov)
|
||||
assert_allclose(log_pdet, large_total_log)
|
||||
|
||||
|
||||
def test_broadcasting():
|
||||
np.random.seed(1234)
|
||||
n = 4
|
||||
|
||||
# Construct a random covariance matrix.
|
||||
data = np.random.randn(n, n)
|
||||
cov = np.dot(data, data.T)
|
||||
mean = np.random.randn(n)
|
||||
|
||||
# Construct an ndarray which can be interpreted as
|
||||
# a 2x3 array whose elements are random data vectors.
|
||||
X = np.random.randn(2, 3, n)
|
||||
|
||||
# Check that multiple data points can be evaluated at once.
|
||||
for i in range(2):
|
||||
for j in range(3):
|
||||
actual = multivariate_normal.pdf(X[i, j], mean, cov)
|
||||
desired = multivariate_normal.pdf(X, mean, cov)[i, j]
|
||||
assert_allclose(actual, desired)
|
||||
|
||||
|
||||
def test_normal_1D():
|
||||
# The probability density function for a 1D normal variable should
|
||||
# agree with the standard normal distribution in scipy.stats.distributions
|
||||
x = np.linspace(0, 2, 10)
|
||||
mean, cov = 1.2, 0.9
|
||||
scale = cov**0.5
|
||||
d1 = norm.pdf(x, mean, scale)
|
||||
d2 = multivariate_normal.pdf(x, mean, cov)
|
||||
assert_allclose(d1, d2)
|
||||
|
||||
|
||||
def test_marginalization():
|
||||
# Integrating out one of the variables of a 2D Gaussian should
|
||||
# yield a 1D Gaussian
|
||||
mean = np.array([2.5, 3.5])
|
||||
cov = np.array([[.5, 0.2], [0.2, .6]])
|
||||
n = 2**8 + 1 # Number of samples
|
||||
delta = 6 / (n - 1) # Grid spacing
|
||||
|
||||
v = np.linspace(0, 6, n)
|
||||
xv, yv = np.meshgrid(v, v)
|
||||
pos = np.empty((n, n, 2))
|
||||
pos[:, :, 0] = xv
|
||||
pos[:, :, 1] = yv
|
||||
pdf = multivariate_normal.pdf(pos, mean, cov)
|
||||
|
||||
# Marginalize over x and y axis
|
||||
margin_x = romb(pdf, delta, axis=0)
|
||||
margin_y = romb(pdf, delta, axis=1)
|
||||
|
||||
# Compare with standard normal distribution
|
||||
gauss_x = norm.pdf(v, loc=mean[0], scale=cov[0, 0]**0.5)
|
||||
gauss_y = norm.pdf(v, loc=mean[1], scale=cov[1, 1]**0.5)
|
||||
assert_allclose(margin_x, gauss_x, rtol=1e-2, atol=1e-2)
|
||||
assert_allclose(margin_y, gauss_y, rtol=1e-2, atol=1e-2)
|
||||
|
||||
|
||||
def test_frozen():
|
||||
# The frozen distribution should agree with the regular one
|
||||
np.random.seed(1234)
|
||||
x = np.random.randn(5)
|
||||
mean = np.random.randn(5)
|
||||
cov = np.abs(np.random.randn(5))
|
||||
norm_frozen = multivariate_normal(mean, cov)
|
||||
assert_allclose(norm_frozen.pdf(x), multivariate_normal.pdf(x, mean, cov))
|
||||
assert_allclose(norm_frozen.logpdf(x),
|
||||
multivariate_normal.logpdf(x, mean, cov))
|
||||
|
||||
|
||||
def test_pseudodet_pinv():
|
||||
# Make sure that pseudo-inverse and pseudo-det agree on cutoff
|
||||
|
||||
# Assemble random covariance matrix with large and small eigenvalues
|
||||
np.random.seed(1234)
|
||||
n = 7
|
||||
x = np.random.randn(n, n)
|
||||
cov = np.dot(x, x.T)
|
||||
s, u = scipy.linalg.eigh(cov)
|
||||
s = 0.5 * np.ones(n)
|
||||
s[0] = 1.0
|
||||
s[-1] = 1e-7
|
||||
cov = np.dot(u, np.dot(np.diag(s), u.T))
|
||||
|
||||
# Set cond so that the lowest eigenvalue is below the cutoff
|
||||
cond = 1e-5
|
||||
U, log_pdet = _psd_pinv_decomposed_log_pdet(cov, cond)
|
||||
pinv = np.dot(U, U.T)
|
||||
_, log_pdet_pinv = _psd_pinv_decomposed_log_pdet(pinv, cond)
|
||||
|
||||
# Check that the log pseudo-determinant agrees with the sum
|
||||
# of the logs of all but the smallest eigenvalue
|
||||
assert_allclose(log_pdet, np.sum(np.log(s[:-1])))
|
||||
# Check that the pseudo-determinant of the pseudo-inverse
|
||||
# agrees with 1 / pseudo-determinant
|
||||
assert_allclose(-log_pdet, log_pdet_pinv)
|
||||
|
||||
|
||||
def test_exception_nonsquare_cov():
|
||||
cov = [[1, 2, 3], [4, 5, 6]]
|
||||
assert_raises(ValueError, _psd_pinv_decomposed_log_pdet, cov)
|
||||
|
||||
|
||||
def test_exception_nonfinite_cov():
|
||||
cov_nan = [[1, 0], [0, np.nan]]
|
||||
assert_raises(ValueError, _psd_pinv_decomposed_log_pdet, cov_nan)
|
||||
cov_inf = [[1, 0], [0, np.inf]]
|
||||
assert_raises(ValueError, _psd_pinv_decomposed_log_pdet, cov_inf)
|
||||
|
||||
|
||||
def test_exception_non_psd_cov():
|
||||
cov = [[1, 0], [0, -1]]
|
||||
assert_raises(ValueError, _psd_pinv_decomposed_log_pdet, cov)
|
||||
|
||||
|
||||
def test_R_values():
|
||||
# Compare the multivariate pdf with some values precomputed
|
||||
# in R version 3.0.1 (2013-05-16) on Mac OS X 10.6.
|
||||
|
||||
# The values below were generated by the following R-script:
|
||||
# > library(mnormt)
|
||||
# > x <- seq(0, 2, length=5)
|
||||
# > y <- 3*x - 2
|
||||
# > z <- x + cos(y)
|
||||
# > mu <- c(1, 3, 2)
|
||||
# > Sigma <- matrix(c(1,2,0,2,5,0.5,0,0.5,3), 3, 3)
|
||||
# > r_pdf <- dmnorm(cbind(x,y,z), mu, Sigma)
|
||||
r_pdf = np.array([0.0002214706, 0.0013819953, 0.0049138692,
|
||||
0.0103803050, 0.0140250800])
|
||||
|
||||
x = np.linspace(0, 2, 5)
|
||||
y = 3 * x - 2
|
||||
z = x + np.cos(y)
|
||||
r = np.array([x, y, z]).T
|
||||
|
||||
mean = np.array([1, 3, 2], 'd')
|
||||
cov = np.array([[1, 2, 0], [2, 5, .5], [0, .5, 3]], 'd')
|
||||
|
||||
pdf = multivariate_normal.pdf(r, mean, cov)
|
||||
assert_allclose(pdf, r_pdf, atol=1e-10)
|
||||
|
||||
|
||||
def test_rvs_shape():
|
||||
# Check that rvs parses the mean and covariance correctly, and returns
|
||||
# an array of the right shape
|
||||
N = 300
|
||||
d = 4
|
||||
sample = multivariate_normal.rvs(mean=np.zeros(d), cov=1, size=N)
|
||||
assert_equal(sample.shape, (N, d))
|
||||
|
||||
sample = multivariate_normal.rvs(mean=None,
|
||||
cov=np.array([[2, .1], [.1, 1]]),
|
||||
size=N)
|
||||
assert_equal(sample.shape, (N, 2))
|
||||
|
||||
u = multivariate_normal(mean=0, cov=1)
|
||||
sample = u.rvs(N)
|
||||
assert_equal(sample.shape, (N, ))
|
||||
|
||||
|
||||
def test_large_sample():
|
||||
# Generate large sample and compare sample mean and sample covariance
|
||||
# with mean and covariance matrix.
|
||||
|
||||
np.random.seed(2846)
|
||||
|
||||
n = 3
|
||||
mean = np.random.randn(n)
|
||||
M = np.random.randn(n, n)
|
||||
cov = np.dot(M, M.T)
|
||||
size = 5000
|
||||
|
||||
sample = multivariate_normal.rvs(mean, cov, size)
|
||||
|
||||
assert_allclose(numpy.cov(sample.T), cov, rtol=1e-1)
|
||||
assert_allclose(sample.mean(0), mean, rtol=1e-1)
|
||||
|
||||
|
||||
def test_entropy():
|
||||
np.random.seed(2846)
|
||||
|
||||
n = 3
|
||||
mean = np.random.randn(n)
|
||||
M = np.random.randn(n, n)
|
||||
cov = np.dot(M, M.T)
|
||||
|
||||
rv = multivariate_normal(mean, cov)
|
||||
|
||||
# Check that frozen distribution agrees with entropy function
|
||||
assert_almost_equal(rv.entropy(), multivariate_normal.entropy(mean, cov))
|
||||
# Compare entropy with manually computed expression involving
|
||||
# the sum of the logs of the eigenvalues of the covariance matrix
|
||||
eigs = np.linalg.eig(cov)[0]
|
||||
desired = 1/2 * (n * (np.log(2*np.pi) + 1) + np.sum(np.log(eigs)))
|
||||
assert_almost_equal(desired, rv.entropy())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_module_suite()
|
@ -0,0 +1,193 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import TestCase, run_module_suite, assert_equal, \
|
||||
assert_array_equal
|
||||
|
||||
from scipy.stats import rankdata, tiecorrect
|
||||
|
||||
|
||||
class TestTieCorrect(TestCase):
|
||||
|
||||
def test_empty(self):
|
||||
"""An empty array requires no correction, should return 1.0."""
|
||||
ranks = np.array([], dtype=np.float64)
|
||||
c = tiecorrect(ranks)
|
||||
assert_equal(c, 1.0)
|
||||
|
||||
def test_one(self):
|
||||
"""A single element requires no correction, should return 1.0."""
|
||||
ranks = np.array([1.0], dtype=np.float64)
|
||||
c = tiecorrect(ranks)
|
||||
assert_equal(c, 1.0)
|
||||
|
||||
def test_no_correction(self):
|
||||
"""Arrays with no ties require no correction."""
|
||||
ranks = np.arange(2.0)
|
||||
c = tiecorrect(ranks)
|
||||
assert_equal(c, 1.0)
|
||||
ranks = np.arange(3.0)
|
||||
c = tiecorrect(ranks)
|
||||
assert_equal(c, 1.0)
|
||||
|
||||
def test_basic(self):
|
||||
"""Check a few basic examples of the tie correction factor."""
|
||||
# One tie of two elements
|
||||
ranks = np.array([1.0, 2.5, 2.5])
|
||||
c = tiecorrect(ranks)
|
||||
T = 2.0
|
||||
N = ranks.size
|
||||
expected = 1.0 - (T**3 - T) / (N**3 - N)
|
||||
assert_equal(c, expected)
|
||||
|
||||
# One tie of two elements (same as above, but tie is not at the end)
|
||||
ranks = np.array([1.5, 1.5, 3.0])
|
||||
c = tiecorrect(ranks)
|
||||
T = 2.0
|
||||
N = ranks.size
|
||||
expected = 1.0 - (T**3 - T) / (N**3 - N)
|
||||
assert_equal(c, expected)
|
||||
|
||||
# One tie of three elements
|
||||
ranks = np.array([1.0, 3.0, 3.0, 3.0])
|
||||
c = tiecorrect(ranks)
|
||||
T = 3.0
|
||||
N = ranks.size
|
||||
expected = 1.0 - (T**3 - T) / (N**3 - N)
|
||||
assert_equal(c, expected)
|
||||
|
||||
# Two ties, lengths 2 and 3.
|
||||
ranks = np.array([1.5, 1.5, 4.0, 4.0, 4.0])
|
||||
c = tiecorrect(ranks)
|
||||
T1 = 2.0
|
||||
T2 = 3.0
|
||||
N = ranks.size
|
||||
expected = 1.0 - ((T1**3 - T1) + (T2**3 - T2)) / (N**3 - N)
|
||||
assert_equal(c, expected)
|
||||
|
||||
|
||||
class TestRankData(TestCase):
|
||||
|
||||
def test_empty(self):
|
||||
"""stats.rankdata([]) should return an empty array."""
|
||||
a = np.array([], dtype=np.int)
|
||||
r = rankdata(a)
|
||||
assert_array_equal(r, np.array([], dtype=np.float64))
|
||||
r = rankdata([])
|
||||
assert_array_equal(r, np.array([], dtype=np.float64))
|
||||
|
||||
def test_one(self):
|
||||
"""Check stats.rankdata with an array of length 1."""
|
||||
data = [100]
|
||||
a = np.array(data, dtype=np.int)
|
||||
r = rankdata(a)
|
||||
assert_array_equal(r, np.array([1.0], dtype=np.float64))
|
||||
r = rankdata(data)
|
||||
assert_array_equal(r, np.array([1.0], dtype=np.float64))
|
||||
|
||||
def test_basic(self):
|
||||
"""Basic tests of stats.rankdata."""
|
||||
data = [100, 10, 50]
|
||||
expected = np.array([3.0, 1.0, 2.0], dtype=np.float64)
|
||||
a = np.array(data, dtype=np.int)
|
||||
r = rankdata(a)
|
||||
assert_array_equal(r, expected)
|
||||
r = rankdata(data)
|
||||
assert_array_equal(r, expected)
|
||||
|
||||
data = [40, 10, 30, 10, 50]
|
||||
expected = np.array([4.0, 1.5, 3.0, 1.5, 5.0], dtype=np.float64)
|
||||
a = np.array(data, dtype=np.int)
|
||||
r = rankdata(a)
|
||||
assert_array_equal(r, expected)
|
||||
r = rankdata(data)
|
||||
assert_array_equal(r, expected)
|
||||
|
||||
data = [20, 20, 20, 10, 10, 10]
|
||||
expected = np.array([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=np.float64)
|
||||
a = np.array(data, dtype=np.int)
|
||||
r = rankdata(a)
|
||||
assert_array_equal(r, expected)
|
||||
r = rankdata(data)
|
||||
assert_array_equal(r, expected)
|
||||
# The docstring states explicitly that the argument is flattened.
|
||||
a2d = a.reshape(2, 3)
|
||||
r = rankdata(a2d)
|
||||
assert_array_equal(r, expected)
|
||||
|
||||
def test_large_int(self):
|
||||
data = np.array([2**60, 2**60+1], dtype=np.uint64)
|
||||
r = rankdata(data)
|
||||
assert_array_equal(r, [1.0, 2.0])
|
||||
|
||||
data = np.array([2**60, 2**60+1], dtype=np.int64)
|
||||
r = rankdata(data)
|
||||
assert_array_equal(r, [1.0, 2.0])
|
||||
|
||||
data = np.array([2**60, -2**60+1], dtype=np.int64)
|
||||
r = rankdata(data)
|
||||
assert_array_equal(r, [2.0, 1.0])
|
||||
|
||||
def test_big_tie(self):
|
||||
for n in [10000, 100000, 1000000]:
|
||||
data = np.ones(n, dtype=int)
|
||||
r = rankdata(data)
|
||||
expected_rank = 0.5 * (n + 1)
|
||||
assert_array_equal(r, expected_rank * data,
|
||||
"test failed with n=%d" % n)
|
||||
|
||||
|
||||
_cases = (
|
||||
# values, method, expected
|
||||
([], 'average', []),
|
||||
([], 'min', []),
|
||||
([], 'max', []),
|
||||
([], 'dense', []),
|
||||
([], 'ordinal', []),
|
||||
#
|
||||
([100], 'average', [1.0]),
|
||||
([100], 'min', [1.0]),
|
||||
([100], 'max', [1.0]),
|
||||
([100], 'dense', [1.0]),
|
||||
([100], 'ordinal', [1.0]),
|
||||
#
|
||||
([100, 100, 100], 'average', [2.0, 2.0, 2.0]),
|
||||
([100, 100, 100], 'min', [1.0, 1.0, 1.0]),
|
||||
([100, 100, 100], 'max', [3.0, 3.0, 3.0]),
|
||||
([100, 100, 100], 'dense', [1.0, 1.0, 1.0]),
|
||||
([100, 100, 100], 'ordinal', [1.0, 2.0, 3.0]),
|
||||
#
|
||||
([100, 300, 200], 'average', [1.0, 3.0, 2.0]),
|
||||
([100, 300, 200], 'min', [1.0, 3.0, 2.0]),
|
||||
([100, 300, 200], 'max', [1.0, 3.0, 2.0]),
|
||||
([100, 300, 200], 'dense', [1.0, 3.0, 2.0]),
|
||||
([100, 300, 200], 'ordinal', [1.0, 3.0, 2.0]),
|
||||
#
|
||||
([100, 200, 300, 200], 'average', [1.0, 2.5, 4.0, 2.5]),
|
||||
([100, 200, 300, 200], 'min', [1.0, 2.0, 4.0, 2.0]),
|
||||
([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]),
|
||||
([100, 200, 300, 200], 'dense', [1.0, 2.0, 3.0, 2.0]),
|
||||
([100, 200, 300, 200], 'ordinal', [1.0, 2.0, 4.0, 3.0]),
|
||||
#
|
||||
([100, 200, 300, 200, 100], 'average', [1.5, 3.5, 5.0, 3.5, 1.5]),
|
||||
([100, 200, 300, 200, 100], 'min', [1.0, 3.0, 5.0, 3.0, 1.0]),
|
||||
([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 2.0]),
|
||||
([100, 200, 300, 200, 100], 'dense', [1.0, 2.0, 3.0, 2.0, 1.0]),
|
||||
([100, 200, 300, 200, 100], 'ordinal', [1.0, 3.0, 5.0, 4.0, 2.0]),
|
||||
#
|
||||
([10] * 30, 'ordinal', np.arange(1.0, 31.0)),
|
||||
)
|
||||
|
||||
|
||||
def test_cases():
|
||||
|
||||
def check_case(values, method, expected):
|
||||
r = rankdata(values, method=method)
|
||||
assert_array_equal(r, expected)
|
||||
|
||||
for values, method, expected in _cases:
|
||||
yield check_case, values, method, expected
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_module_suite()
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,91 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_allclose, assert_equal, run_module_suite
|
||||
|
||||
from scipy.stats._tukeylambda_stats import tukeylambda_variance, \
|
||||
tukeylambda_kurtosis
|
||||
|
||||
|
||||
def test_tukeylambda_stats_known_exact():
|
||||
"""Compare results with some known exact formulas."""
|
||||
# Some exact values of the Tukey Lambda variance and kurtosis:
|
||||
# lambda var kurtosis
|
||||
# 0 pi**2/3 6/5 (logistic distribution)
|
||||
# 0.5 4 - pi (5/3 - pi/2)/(pi/4 - 1)**2 - 3
|
||||
# 1 1/3 -6/5 (uniform distribution on (-1,1))
|
||||
# 2 1/12 -6/5 (uniform distribution on (-1/2, 1/2))
|
||||
|
||||
# lambda = 0
|
||||
var = tukeylambda_variance(0)
|
||||
assert_allclose(var, np.pi**2 / 3, atol=1e-12)
|
||||
kurt = tukeylambda_kurtosis(0)
|
||||
assert_allclose(kurt, 1.2, atol=1e-10)
|
||||
|
||||
# lambda = 0.5
|
||||
var = tukeylambda_variance(0.5)
|
||||
assert_allclose(var, 4 - np.pi, atol=1e-12)
|
||||
kurt = tukeylambda_kurtosis(0.5)
|
||||
desired = (5./3 - np.pi/2) / (np.pi/4 - 1)**2 - 3
|
||||
assert_allclose(kurt, desired, atol=1e-10)
|
||||
|
||||
# lambda = 1
|
||||
var = tukeylambda_variance(1)
|
||||
assert_allclose(var, 1.0 / 3, atol=1e-12)
|
||||
kurt = tukeylambda_kurtosis(1)
|
||||
assert_allclose(kurt, -1.2, atol=1e-10)
|
||||
|
||||
# lambda = 2
|
||||
var = tukeylambda_variance(2)
|
||||
assert_allclose(var, 1.0 / 12, atol=1e-12)
|
||||
kurt = tukeylambda_kurtosis(2)
|
||||
assert_allclose(kurt, -1.2, atol=1e-10)
|
||||
|
||||
|
||||
def test_tukeylambda_stats_mpmath():
|
||||
"""Compare results with some values that were computed using mpmath."""
|
||||
a10 = dict(atol=1e-10, rtol=0)
|
||||
a12 = dict(atol=1e-12, rtol=0)
|
||||
data = [
|
||||
# lambda variance kurtosis
|
||||
[-0.1, 4.78050217874253547, 3.78559520346454510],
|
||||
[-0.0649, 4.16428023599895777, 2.52019675947435718],
|
||||
[-0.05, 3.93672267890775277, 2.13129793057777277],
|
||||
[-0.001, 3.30128380390964882, 1.21452460083542988],
|
||||
[0.001, 3.27850775649572176, 1.18560634779287585],
|
||||
[0.03125, 2.95927803254615800, 0.804487555161819980],
|
||||
[0.05, 2.78281053405464501, 0.611604043886644327],
|
||||
[0.0649, 2.65282386754100551, 0.476834119532774540],
|
||||
[1.2, 0.242153920578588346, -1.23428047169049726],
|
||||
[10.0, 0.00095237579757703597, 2.37810697355144933],
|
||||
[20.0, 0.00012195121951131043, 7.37654321002709531],
|
||||
]
|
||||
|
||||
for lam, var_expected, kurt_expected in data:
|
||||
var = tukeylambda_variance(lam)
|
||||
assert_allclose(var, var_expected, **a12)
|
||||
kurt = tukeylambda_kurtosis(lam)
|
||||
assert_allclose(kurt, kurt_expected, **a10)
|
||||
|
||||
# Test with vector arguments (most of the other tests are for single
|
||||
# values).
|
||||
lam, var_expected, kurt_expected = zip(*data)
|
||||
var = tukeylambda_variance(lam)
|
||||
assert_allclose(var, var_expected, **a12)
|
||||
kurt = tukeylambda_kurtosis(lam)
|
||||
assert_allclose(kurt, kurt_expected, **a10)
|
||||
|
||||
|
||||
def test_tukeylambda_stats_invalid():
|
||||
"""Test values of lambda outside the domains of the functions."""
|
||||
lam = [-1.0, -0.5]
|
||||
var = tukeylambda_variance(lam)
|
||||
assert_equal(var, np.array([np.nan, np.inf]))
|
||||
|
||||
lam = [-1.0, -0.25]
|
||||
kurt = tukeylambda_kurtosis(lam)
|
||||
assert_equal(kurt, np.array([np.nan, np.inf]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_module_suite()
|
@ -0,0 +1,47 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
import scipy.stats
|
||||
from scipy.special import i0
|
||||
|
||||
|
||||
def von_mises_cdf_series(k,x,p):
|
||||
x = float(x)
|
||||
s = np.sin(x)
|
||||
c = np.cos(x)
|
||||
sn = np.sin(p*x)
|
||||
cn = np.cos(p*x)
|
||||
R = 0
|
||||
V = 0
|
||||
for n in range(p-1,0,-1):
|
||||
sn, cn = sn*c - cn*s, cn*c + sn*s
|
||||
R = 1./(2*n/k + R)
|
||||
V = R*(sn/n+V)
|
||||
|
||||
return 0.5+x/(2*np.pi) + V/np.pi
|
||||
|
||||
|
||||
def von_mises_cdf_normalapprox(k,x,C1):
|
||||
b = np.sqrt(2/np.pi)*np.exp(k)/i0(k)
|
||||
z = b*np.sin(x/2.)
|
||||
return scipy.stats.norm.cdf(z)
|
||||
|
||||
|
||||
def von_mises_cdf(k,x):
|
||||
ix = 2*np.pi*np.round(x/(2*np.pi))
|
||||
x = x-ix
|
||||
k = float(k)
|
||||
|
||||
# These values should give 12 decimal digits
|
||||
CK = 50
|
||||
a = [28., 0.5, 100., 5.0]
|
||||
C1 = 50.1
|
||||
|
||||
if k < CK:
|
||||
p = int(np.ceil(a[0]+a[1]*k-a[2]/(k+a[3])))
|
||||
|
||||
F = np.clip(von_mises_cdf_series(k,x,p),0,1)
|
||||
else:
|
||||
F = von_mises_cdf_normalapprox(k,x,C1)
|
||||
|
||||
return F+ix
|
@ -0,0 +1,76 @@
|
||||
import numpy as np
|
||||
import scipy.stats
|
||||
from scipy.special import i0
|
||||
import numpy.testing
|
||||
cimport numpy as np
|
||||
|
||||
cdef extern from "math.h":
|
||||
double cos(double theta)
|
||||
double sin(double theta)
|
||||
|
||||
|
||||
cdef double von_mises_cdf_series(double k,double x,unsigned int p):
|
||||
cdef double s, c, sn, cn, R, V
|
||||
cdef unsigned int n
|
||||
s = sin(x)
|
||||
c = cos(x)
|
||||
sn = sin(p*x)
|
||||
cn = cos(p*x)
|
||||
R = 0
|
||||
V = 0
|
||||
for n in range(p-1,0,-1):
|
||||
sn, cn = sn*c - cn*s, cn*c + sn*s
|
||||
R = 1./(2*n/k + R)
|
||||
V = R*(sn/n+V)
|
||||
|
||||
return 0.5+x/(2*np.pi) + V/np.pi
|
||||
|
||||
def von_mises_cdf_normalapprox(k,x,C1):
|
||||
b = np.sqrt(2/np.pi)*np.exp(k)/i0(k)
|
||||
z = b*np.sin(x/2.)
|
||||
C = 24*k
|
||||
chi = z - z**3/((C-2*z**2-16)/3.-(z**4+7/4.*z**2+167./2)/(C+C1-z**2+3))**2
|
||||
return scipy.stats.norm.cdf(z)
|
||||
|
||||
cimport cython
|
||||
@cython.boundscheck(False)
|
||||
def von_mises_cdf(k,x):
|
||||
cdef np.ndarray[double, ndim=1] temp, temp_xs, temp_ks
|
||||
cdef unsigned int i, p
|
||||
cdef double a1, a2, a3, a4, C1, CK
|
||||
#k,x = np.broadcast_arrays(np.asarray(k),np.asarray(x))
|
||||
k = np.asarray(k)
|
||||
x = np.asarray(x)
|
||||
zerodim = k.ndim==0 and x.ndim==0
|
||||
|
||||
k = np.atleast_1d(k)
|
||||
x = np.atleast_1d(x)
|
||||
ix = np.round(x/(2*np.pi))
|
||||
x = x-ix*2*np.pi
|
||||
|
||||
# These values should give 12 decimal digits
|
||||
CK=50
|
||||
a1, a2, a3, a4 = [28., 0.5, 100., 5.0]
|
||||
C1 = 50.1
|
||||
|
||||
bx, bk = np.broadcast_arrays(x,k)
|
||||
result = np.empty(bx.shape,dtype=np.float)
|
||||
|
||||
c_small_k = bk<CK
|
||||
temp = result[c_small_k]
|
||||
temp_xs = bx[c_small_k].astype(np.float)
|
||||
temp_ks = bk[c_small_k].astype(np.float)
|
||||
for i in range(len(temp)):
|
||||
p = <int>(1+a1+a2*temp_ks[i]-a3/(temp_ks[i]+a4))
|
||||
temp[i] = von_mises_cdf_series(temp_ks[i],temp_xs[i],p)
|
||||
if temp[i]<0:
|
||||
temp[i]=0
|
||||
elif temp[i]>1:
|
||||
temp[i]=1
|
||||
result[c_small_k] = temp
|
||||
result[~c_small_k] = von_mises_cdf_normalapprox(bk[~c_small_k],bx[~c_small_k],C1)
|
||||
|
||||
if not zerodim:
|
||||
return result+ix
|
||||
else:
|
||||
return (result+ix)[0]
|
Loading…
Reference in New Issue