Updated from wafo.stats from scipy.stats
parent
5c84825641
commit
6b88f2d4cc
@ -1,12 +1,346 @@
|
|||||||
"""
|
"""
|
||||||
Statistics package in WAFO Toolbox.
|
==========================================
|
||||||
|
Statistical functions (:mod:`scipy.stats`)
|
||||||
|
==========================================
|
||||||
|
|
||||||
Readme - Readme file for module STATS in WAFO Toolbox
|
.. module:: scipy.stats
|
||||||
|
|
||||||
|
This module contains a large number of probability distributions as
|
||||||
|
well as a growing library of statistical functions.
|
||||||
|
|
||||||
|
Each included distribution is an instance of the class rv_continous:
|
||||||
|
For each given name the following methods are available:
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
rv_continuous
|
||||||
|
rv_continuous.pdf
|
||||||
|
rv_continuous.logpdf
|
||||||
|
rv_continuous.cdf
|
||||||
|
rv_continuous.logcdf
|
||||||
|
rv_continuous.sf
|
||||||
|
rv_continuous.logsf
|
||||||
|
rv_continuous.ppf
|
||||||
|
rv_continuous.isf
|
||||||
|
rv_continuous.moment
|
||||||
|
rv_continuous.stats
|
||||||
|
rv_continuous.entropy
|
||||||
|
rv_continuous.fit
|
||||||
|
rv_continuous.expect
|
||||||
|
|
||||||
|
Calling the instance as a function returns a frozen pdf whose shape,
|
||||||
|
location, and scale parameters are fixed.
|
||||||
|
|
||||||
|
Similarly, each discrete distribution is an instance of the class
|
||||||
|
rv_discrete:
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
rv_discrete
|
||||||
|
rv_discrete.rvs
|
||||||
|
rv_discrete.pmf
|
||||||
|
rv_discrete.logpmf
|
||||||
|
rv_discrete.cdf
|
||||||
|
rv_discrete.logcdf
|
||||||
|
rv_discrete.sf
|
||||||
|
rv_discrete.logsf
|
||||||
|
rv_discrete.ppf
|
||||||
|
rv_discrete.isf
|
||||||
|
rv_discrete.stats
|
||||||
|
rv_discrete.moment
|
||||||
|
rv_discrete.entropy
|
||||||
|
rv_discrete.expect
|
||||||
|
|
||||||
|
Continuous distributions
|
||||||
|
========================
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
alpha -- Alpha
|
||||||
|
anglit -- Anglit
|
||||||
|
arcsine -- Arcsine
|
||||||
|
beta -- Beta
|
||||||
|
betaprime -- Beta Prime
|
||||||
|
bradford -- Bradford
|
||||||
|
burr -- Burr
|
||||||
|
cauchy -- Cauchy
|
||||||
|
chi -- Chi
|
||||||
|
chi2 -- Chi-squared
|
||||||
|
cosine -- Cosine
|
||||||
|
dgamma -- Double Gamma
|
||||||
|
dweibull -- Double Weibull
|
||||||
|
erlang -- Erlang
|
||||||
|
expon -- Exponential
|
||||||
|
exponweib -- Exponentiated Weibull
|
||||||
|
exponpow -- Exponential Power
|
||||||
|
f -- F (Snecdor F)
|
||||||
|
fatiguelife -- Fatigue Life (Birnbaum-Sanders)
|
||||||
|
fisk -- Fisk
|
||||||
|
foldcauchy -- Folded Cauchy
|
||||||
|
foldnorm -- Folded Normal
|
||||||
|
frechet_r -- Frechet Right Sided, Extreme Value Type II (Extreme LB) or weibull_min
|
||||||
|
frechet_l -- Frechet Left Sided, Weibull_max
|
||||||
|
genlogistic -- Generalized Logistic
|
||||||
|
genpareto -- Generalized Pareto
|
||||||
|
genexpon -- Generalized Exponential
|
||||||
|
genextreme -- Generalized Extreme Value
|
||||||
|
gausshyper -- Gauss Hypergeometric
|
||||||
|
gamma -- Gamma
|
||||||
|
gengamma -- Generalized gamma
|
||||||
|
genhalflogistic -- Generalized Half Logistic
|
||||||
|
gilbrat -- Gilbrat
|
||||||
|
gompertz -- Gompertz (Truncated Gumbel)
|
||||||
|
gumbel_r -- Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
|
||||||
|
gumbel_l -- Left Sided Gumbel, etc.
|
||||||
|
halfcauchy -- Half Cauchy
|
||||||
|
halflogistic -- Half Logistic
|
||||||
|
halfnorm -- Half Normal
|
||||||
|
hypsecant -- Hyperbolic Secant
|
||||||
|
invgamma -- Inverse Gamma
|
||||||
|
invgauss -- Inverse Gaussian
|
||||||
|
invweibull -- Inverse Weibull
|
||||||
|
johnsonsb -- Johnson SB
|
||||||
|
johnsonsu -- Johnson SU
|
||||||
|
ksone -- Kolmogorov-Smirnov one-sided (no stats)
|
||||||
|
kstwobign -- Kolmogorov-Smirnov two-sided test for Large N (no stats)
|
||||||
|
laplace -- Laplace
|
||||||
|
logistic -- Logistic
|
||||||
|
loggamma -- Log-Gamma
|
||||||
|
loglaplace -- Log-Laplace (Log Double Exponential)
|
||||||
|
lognorm -- Log-Normal
|
||||||
|
lomax -- Lomax (Pareto of the second kind)
|
||||||
|
maxwell -- Maxwell
|
||||||
|
mielke -- Mielke's Beta-Kappa
|
||||||
|
nakagami -- Nakagami
|
||||||
|
ncx2 -- Non-central chi-squared
|
||||||
|
ncf -- Non-central F
|
||||||
|
nct -- Non-central Student's T
|
||||||
|
norm -- Normal (Gaussian)
|
||||||
|
pareto -- Pareto
|
||||||
|
pearson3 -- Pearson type III
|
||||||
|
powerlaw -- Power-function
|
||||||
|
powerlognorm -- Power log normal
|
||||||
|
powernorm -- Power normal
|
||||||
|
rdist -- R-distribution
|
||||||
|
reciprocal -- Reciprocal
|
||||||
|
rayleigh -- Rayleigh
|
||||||
|
rice -- Rice
|
||||||
|
recipinvgauss -- Reciprocal Inverse Gaussian
|
||||||
|
semicircular -- Semicircular
|
||||||
|
t -- Student's T
|
||||||
|
triang -- Triangular
|
||||||
|
truncexpon -- Truncated Exponential
|
||||||
|
truncnorm -- Truncated Normal
|
||||||
|
tukeylambda -- Tukey-Lambda
|
||||||
|
uniform -- Uniform
|
||||||
|
vonmises -- Von-Mises (Circular)
|
||||||
|
wald -- Wald
|
||||||
|
weibull_min -- Minimum Weibull (see Frechet)
|
||||||
|
weibull_max -- Maximum Weibull (see Frechet)
|
||||||
|
wrapcauchy -- Wrapped Cauchy
|
||||||
|
|
||||||
|
Multivariate distributions
|
||||||
|
==========================
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
multivariate_normal -- Multivariate normal distribution
|
||||||
|
|
||||||
|
Discrete distributions
|
||||||
|
======================
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
bernoulli -- Bernoulli
|
||||||
|
binom -- Binomial
|
||||||
|
boltzmann -- Boltzmann (Truncated Discrete Exponential)
|
||||||
|
dlaplace -- Discrete Laplacian
|
||||||
|
geom -- Geometric
|
||||||
|
hypergeom -- Hypergeometric
|
||||||
|
logser -- Logarithmic (Log-Series, Series)
|
||||||
|
nbinom -- Negative Binomial
|
||||||
|
planck -- Planck (Discrete Exponential)
|
||||||
|
poisson -- Poisson
|
||||||
|
randint -- Discrete Uniform
|
||||||
|
skellam -- Skellam
|
||||||
|
zipf -- Zipf
|
||||||
|
|
||||||
|
Statistical functions
|
||||||
|
=====================
|
||||||
|
|
||||||
|
Several of these functions have a similar version in scipy.stats.mstats
|
||||||
|
which work for masked arrays.
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
describe -- Descriptive statistics
|
||||||
|
gmean -- Geometric mean
|
||||||
|
hmean -- Harmonic mean
|
||||||
|
kurtosis -- Fisher or Pearson kurtosis
|
||||||
|
kurtosistest --
|
||||||
|
mode -- Modal value
|
||||||
|
moment -- Central moment
|
||||||
|
normaltest --
|
||||||
|
skew -- Skewness
|
||||||
|
skewtest --
|
||||||
|
tmean -- Truncated arithmetic mean
|
||||||
|
tvar -- Truncated variance
|
||||||
|
tmin --
|
||||||
|
tmax --
|
||||||
|
tstd --
|
||||||
|
tsem --
|
||||||
|
nanmean -- Mean, ignoring NaN values
|
||||||
|
nanstd -- Standard deviation, ignoring NaN values
|
||||||
|
nanmedian -- Median, ignoring NaN values
|
||||||
|
variation -- Coefficient of variation
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
cumfreq _
|
||||||
|
histogram2 _
|
||||||
|
histogram _
|
||||||
|
itemfreq _
|
||||||
|
percentileofscore _
|
||||||
|
scoreatpercentile _
|
||||||
|
relfreq _
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
binned_statistic -- Compute a binned statistic for a set of data.
|
||||||
|
binned_statistic_2d -- Compute a 2-D binned statistic for a set of data.
|
||||||
|
binned_statistic_dd -- Compute a d-D binned statistic for a set of data.
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
obrientransform
|
||||||
|
signaltonoise
|
||||||
|
bayes_mvs
|
||||||
|
sem
|
||||||
|
zmap
|
||||||
|
zscore
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
threshold
|
||||||
|
trimboth
|
||||||
|
trim1
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
f_oneway
|
||||||
|
pearsonr
|
||||||
|
spearmanr
|
||||||
|
pointbiserialr
|
||||||
|
kendalltau
|
||||||
|
linregress
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
ttest_1samp
|
||||||
|
ttest_ind
|
||||||
|
ttest_rel
|
||||||
|
kstest
|
||||||
|
chisquare
|
||||||
|
power_divergence
|
||||||
|
ks_2samp
|
||||||
|
mannwhitneyu
|
||||||
|
tiecorrect
|
||||||
|
rankdata
|
||||||
|
ranksums
|
||||||
|
wilcoxon
|
||||||
|
kruskal
|
||||||
|
friedmanchisquare
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
ansari
|
||||||
|
bartlett
|
||||||
|
levene
|
||||||
|
shapiro
|
||||||
|
anderson
|
||||||
|
binom_test
|
||||||
|
fligner
|
||||||
|
mood
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
boxcox
|
||||||
|
boxcox_normmax
|
||||||
|
boxcox_llf
|
||||||
|
|
||||||
|
Contingency table functions
|
||||||
|
===========================
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
chi2_contingency
|
||||||
|
contingency.expected_freq
|
||||||
|
contingency.margins
|
||||||
|
fisher_exact
|
||||||
|
|
||||||
|
Plot-tests
|
||||||
|
==========
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
ppcc_max
|
||||||
|
ppcc_plot
|
||||||
|
probplot
|
||||||
|
boxcox_normplot
|
||||||
|
|
||||||
|
|
||||||
|
Masked statistics functions
|
||||||
|
===========================
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
|
||||||
|
stats.mstats
|
||||||
|
|
||||||
|
|
||||||
|
Univariate and multivariate kernel density estimation (:mod:`scipy.stats.kde`)
|
||||||
|
==============================================================================
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
gaussian_kde
|
||||||
|
|
||||||
|
For many more stat related functions install the software R and the
|
||||||
|
interface package rpy.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
from scipy.stats import *
|
from scipy.stats import *
|
||||||
from core import *
|
from .core import *
|
||||||
import distributions #@Reimport
|
from .stats import *
|
||||||
from wafo.stats.distributions import *
|
from .distributions import *
|
||||||
|
from .rv import *
|
||||||
|
from .morestats import *
|
||||||
|
from ._binned_statistic import *
|
||||||
|
from .kde import gaussian_kde
|
||||||
|
from . import mstats
|
||||||
|
from .contingency import chi2_contingency
|
||||||
|
from ._multivariate import *
|
||||||
|
from . import estimation
|
||||||
|
|
||||||
|
#remove vonmises_cython from __all__, I don't know why it is included
|
||||||
|
__all__ = [s for s in dir() if not (s.startswith('_') or s.endswith('cython'))]
|
||||||
|
#import distributions #@Reimport
|
||||||
|
#from wafo.stats.distributions import *
|
||||||
|
|
||||||
import estimation
|
|
@ -0,0 +1,762 @@
|
|||||||
|
#
|
||||||
|
# Author: Travis Oliphant 2002-2011 with contributions from
|
||||||
|
# SciPy Developers 2004-2011
|
||||||
|
#
|
||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
from scipy import special
|
||||||
|
from scipy.special import gammaln as gamln
|
||||||
|
|
||||||
|
from numpy import floor, ceil, log, exp, sqrt, log1p, expm1, tanh, cosh, sinh
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import numpy.random as mtrand
|
||||||
|
|
||||||
|
from ._distn_infrastructure import (
|
||||||
|
rv_discrete, _lazywhere, _ncx2_pdf, _ncx2_cdf)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'binom', 'bernoulli', 'nbinom', 'geom', 'hypergeom',
|
||||||
|
'logser', 'poisson', 'planck', 'boltzmann', 'randint',
|
||||||
|
'zipf', 'dlaplace', 'skellam'
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class binom_gen(rv_discrete):
|
||||||
|
"""A binomial discrete random variable.
|
||||||
|
|
||||||
|
%(before_notes)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The probability mass function for `binom` is::
|
||||||
|
|
||||||
|
binom.pmf(k) = choose(n, k) * p**k * (1-p)**(n-k)
|
||||||
|
|
||||||
|
for ``k`` in ``{0, 1,..., n}``.
|
||||||
|
|
||||||
|
`binom` takes ``n`` and ``p`` as shape parameters.
|
||||||
|
|
||||||
|
%(example)s
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _rvs(self, n, p):
|
||||||
|
return mtrand.binomial(n, p, self._size)
|
||||||
|
|
||||||
|
def _argcheck(self, n, p):
|
||||||
|
self.b = n
|
||||||
|
return (n >= 0) & (p >= 0) & (p <= 1)
|
||||||
|
|
||||||
|
def _logpmf(self, x, n, p):
|
||||||
|
k = floor(x)
|
||||||
|
combiln = (gamln(n+1) - (gamln(k+1) + gamln(n-k+1)))
|
||||||
|
return combiln + special.xlogy(k, p) + special.xlog1py(n-k, -p)
|
||||||
|
|
||||||
|
def _pmf(self, x, n, p):
|
||||||
|
return exp(self._logpmf(x, n, p))
|
||||||
|
|
||||||
|
def _cdf(self, x, n, p):
|
||||||
|
k = floor(x)
|
||||||
|
vals = special.bdtr(k, n, p)
|
||||||
|
return vals
|
||||||
|
|
||||||
|
def _sf(self, x, n, p):
|
||||||
|
k = floor(x)
|
||||||
|
return special.bdtrc(k, n, p)
|
||||||
|
|
||||||
|
def _ppf(self, q, n, p):
|
||||||
|
vals = ceil(special.bdtrik(q, n, p))
|
||||||
|
vals1 = vals-1
|
||||||
|
temp = special.bdtr(vals1, n, p)
|
||||||
|
return np.where(temp >= q, vals1, vals)
|
||||||
|
|
||||||
|
def _stats(self, n, p):
|
||||||
|
q = 1.0-p
|
||||||
|
mu = n * p
|
||||||
|
var = n * p * q
|
||||||
|
g1 = (q-p) / sqrt(n*p*q)
|
||||||
|
g2 = (1.0-6*p*q)/(n*p*q)
|
||||||
|
return mu, var, g1, g2
|
||||||
|
|
||||||
|
def _entropy(self, n, p):
|
||||||
|
k = np.r_[0:n + 1]
|
||||||
|
vals = self._pmf(k, n, p)
|
||||||
|
h = -np.sum(special.xlogy(vals, vals), axis=0)
|
||||||
|
return h
|
||||||
|
binom = binom_gen(name='binom')
|
||||||
|
|
||||||
|
|
||||||
|
class bernoulli_gen(binom_gen):
|
||||||
|
"""A Bernoulli discrete random variable.
|
||||||
|
|
||||||
|
%(before_notes)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The probability mass function for `bernoulli` is::
|
||||||
|
|
||||||
|
bernoulli.pmf(k) = 1-p if k = 0
|
||||||
|
= p if k = 1
|
||||||
|
|
||||||
|
for ``k`` in ``{0, 1}``.
|
||||||
|
|
||||||
|
`bernoulli` takes ``p`` as shape parameter.
|
||||||
|
|
||||||
|
%(example)s
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _rvs(self, p):
|
||||||
|
return binom_gen._rvs(self, 1, p)
|
||||||
|
|
||||||
|
def _argcheck(self, p):
|
||||||
|
return (p >= 0) & (p <= 1)
|
||||||
|
|
||||||
|
def _logpmf(self, x, p):
|
||||||
|
return binom._logpmf(x, 1, p)
|
||||||
|
|
||||||
|
def _pmf(self, x, p):
|
||||||
|
return binom._pmf(x, 1, p)
|
||||||
|
|
||||||
|
def _cdf(self, x, p):
|
||||||
|
return binom._cdf(x, 1, p)
|
||||||
|
|
||||||
|
def _sf(self, x, p):
|
||||||
|
return binom._sf(x, 1, p)
|
||||||
|
|
||||||
|
def _ppf(self, q, p):
|
||||||
|
return binom._ppf(q, 1, p)
|
||||||
|
|
||||||
|
def _stats(self, p):
|
||||||
|
return binom._stats(1, p)
|
||||||
|
|
||||||
|
def _entropy(self, p):
|
||||||
|
h = -special.xlogy(p, p) - special.xlogy(1 - p, 1 - p)
|
||||||
|
return h
|
||||||
|
bernoulli = bernoulli_gen(b=1, name='bernoulli')
|
||||||
|
|
||||||
|
|
||||||
|
class nbinom_gen(rv_discrete):
|
||||||
|
"""A negative binomial discrete random variable.
|
||||||
|
|
||||||
|
%(before_notes)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The probability mass function for `nbinom` is::
|
||||||
|
|
||||||
|
nbinom.pmf(k) = choose(k+n-1, n-1) * p**n * (1-p)**k
|
||||||
|
|
||||||
|
for ``k >= 0``.
|
||||||
|
|
||||||
|
`nbinom` takes ``n`` and ``p`` as shape parameters.
|
||||||
|
|
||||||
|
%(example)s
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _rvs(self, n, p):
|
||||||
|
return mtrand.negative_binomial(n, p, self._size)
|
||||||
|
|
||||||
|
def _argcheck(self, n, p):
|
||||||
|
return (n >= 0) & (p >= 0) & (p <= 1)
|
||||||
|
|
||||||
|
def _pmf(self, x, n, p):
|
||||||
|
return exp(self._logpmf(x, n, p))
|
||||||
|
|
||||||
|
def _logpmf(self, x, n, p):
|
||||||
|
coeff = gamln(n+x) - gamln(x+1) - gamln(n)
|
||||||
|
return coeff + n*log(p) + x*log1p(-p)
|
||||||
|
|
||||||
|
def _cdf(self, x, n, p):
|
||||||
|
k = floor(x)
|
||||||
|
return special.betainc(n, k+1, p)
|
||||||
|
|
||||||
|
def _sf_skip(self, x, n, p):
|
||||||
|
# skip because special.nbdtrc doesn't work for 0<n<1
|
||||||
|
k = floor(x)
|
||||||
|
return special.nbdtrc(k, n, p)
|
||||||
|
|
||||||
|
def _ppf(self, q, n, p):
|
||||||
|
vals = ceil(special.nbdtrik(q, n, p))
|
||||||
|
vals1 = (vals-1).clip(0.0, np.inf)
|
||||||
|
temp = self._cdf(vals1, n, p)
|
||||||
|
return np.where(temp >= q, vals1, vals)
|
||||||
|
|
||||||
|
def _stats(self, n, p):
|
||||||
|
Q = 1.0 / p
|
||||||
|
P = Q - 1.0
|
||||||
|
mu = n*P
|
||||||
|
var = n*P*Q
|
||||||
|
g1 = (Q+P)/sqrt(n*P*Q)
|
||||||
|
g2 = (1.0 + 6*P*Q) / (n*P*Q)
|
||||||
|
return mu, var, g1, g2
|
||||||
|
nbinom = nbinom_gen(name='nbinom')
|
||||||
|
|
||||||
|
|
||||||
|
class geom_gen(rv_discrete):
|
||||||
|
"""A geometric discrete random variable.
|
||||||
|
|
||||||
|
%(before_notes)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The probability mass function for `geom` is::
|
||||||
|
|
||||||
|
geom.pmf(k) = (1-p)**(k-1)*p
|
||||||
|
|
||||||
|
for ``k >= 1``.
|
||||||
|
|
||||||
|
`geom` takes ``p`` as shape parameter.
|
||||||
|
|
||||||
|
%(example)s
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _rvs(self, p):
|
||||||
|
return mtrand.geometric(p, size=self._size)
|
||||||
|
|
||||||
|
def _argcheck(self, p):
|
||||||
|
return (p <= 1) & (p >= 0)
|
||||||
|
|
||||||
|
def _pmf(self, k, p):
|
||||||
|
return np.power(1-p, k-1) * p
|
||||||
|
|
||||||
|
def _logpmf(self, k, p):
|
||||||
|
return (k-1)*log1p(-p) + log(p)
|
||||||
|
|
||||||
|
def _cdf(self, x, p):
|
||||||
|
k = floor(x)
|
||||||
|
return -expm1(log1p(-p)*k)
|
||||||
|
|
||||||
|
def _sf(self, x, p):
|
||||||
|
return np.exp(self._logsf(x, p))
|
||||||
|
|
||||||
|
def _logsf(self, x, p):
|
||||||
|
k = floor(x)
|
||||||
|
return k*log1p(-p)
|
||||||
|
|
||||||
|
def _ppf(self, q, p):
|
||||||
|
vals = ceil(log1p(-q)/log1p(-p))
|
||||||
|
temp = self._cdf(vals-1, p)
|
||||||
|
return np.where((temp >= q) & (vals > 0), vals-1, vals)
|
||||||
|
|
||||||
|
def _stats(self, p):
|
||||||
|
mu = 1.0/p
|
||||||
|
qr = 1.0-p
|
||||||
|
var = qr / p / p
|
||||||
|
g1 = (2.0-p) / sqrt(qr)
|
||||||
|
g2 = np.polyval([1, -6, 6], p)/(1.0-p)
|
||||||
|
return mu, var, g1, g2
|
||||||
|
geom = geom_gen(a=1, name='geom', longname="A geometric")
|
||||||
|
|
||||||
|
|
||||||
|
class hypergeom_gen(rv_discrete):
|
||||||
|
"""A hypergeometric discrete random variable.
|
||||||
|
|
||||||
|
The hypergeometric distribution models drawing objects from a bin.
|
||||||
|
M is the total number of objects, n is total number of Type I objects.
|
||||||
|
The random variate represents the number of Type I objects in N drawn
|
||||||
|
without replacement from the total population.
|
||||||
|
|
||||||
|
%(before_notes)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The probability mass function is defined as::
|
||||||
|
|
||||||
|
pmf(k, M, n, N) = choose(n, k) * choose(M - n, N - k) / choose(M, N),
|
||||||
|
for max(0, N - (M-n)) <= k <= min(n, N)
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> from scipy.stats import hypergeom
|
||||||
|
|
||||||
|
Suppose we have a collection of 20 animals, of which 7 are dogs. Then if
|
||||||
|
we want to know the probability of finding a given number of dogs if we
|
||||||
|
choose at random 12 of the 20 animals, we can initialize a frozen
|
||||||
|
distribution and plot the probability mass function:
|
||||||
|
|
||||||
|
>>> [M, n, N] = [20, 7, 12]
|
||||||
|
>>> rv = hypergeom(M, n, N)
|
||||||
|
>>> x = np.arange(0, n+1)
|
||||||
|
>>> pmf_dogs = rv.pmf(x)
|
||||||
|
|
||||||
|
>>> fig = plt.figure()
|
||||||
|
>>> ax = fig.add_subplot(111)
|
||||||
|
>>> ax.plot(x, pmf_dogs, 'bo')
|
||||||
|
>>> ax.vlines(x, 0, pmf_dogs, lw=2)
|
||||||
|
>>> ax.set_xlabel('# of dogs in our group of chosen animals')
|
||||||
|
>>> ax.set_ylabel('hypergeom PMF')
|
||||||
|
>>> plt.show()
|
||||||
|
|
||||||
|
Instead of using a frozen distribution we can also use `hypergeom`
|
||||||
|
methods directly. To for example obtain the cumulative distribution
|
||||||
|
function, use:
|
||||||
|
|
||||||
|
>>> prb = hypergeom.cdf(x, M, n, N)
|
||||||
|
|
||||||
|
And to generate random numbers:
|
||||||
|
|
||||||
|
>>> R = hypergeom.rvs(M, n, N, size=10)
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _rvs(self, M, n, N):
|
||||||
|
return mtrand.hypergeometric(n, M-n, N, size=self._size)
|
||||||
|
|
||||||
|
def _argcheck(self, M, n, N):
|
||||||
|
cond = rv_discrete._argcheck(self, M, n, N)
|
||||||
|
cond &= (n <= M) & (N <= M)
|
||||||
|
self.a = max(N-(M-n), 0)
|
||||||
|
self.b = min(n, N)
|
||||||
|
return cond
|
||||||
|
|
||||||
|
def _logpmf(self, k, M, n, N):
|
||||||
|
tot, good = M, n
|
||||||
|
bad = tot - good
|
||||||
|
return gamln(good+1) - gamln(good-k+1) - gamln(k+1) + gamln(bad+1) \
|
||||||
|
- gamln(bad-N+k+1) - gamln(N-k+1) - gamln(tot+1) + gamln(tot-N+1) \
|
||||||
|
+ gamln(N+1)
|
||||||
|
|
||||||
|
def _pmf(self, k, M, n, N):
|
||||||
|
# same as the following but numerically more precise
|
||||||
|
# return comb(good, k) * comb(bad, N-k) / comb(tot, N)
|
||||||
|
return exp(self._logpmf(k, M, n, N))
|
||||||
|
|
||||||
|
def _stats(self, M, n, N):
|
||||||
|
# tot, good, sample_size = M, n, N
|
||||||
|
# "wikipedia".replace('N', 'M').replace('n', 'N').replace('K', 'n')
|
||||||
|
M, n, N = 1.*M, 1.*n, 1.*N
|
||||||
|
m = M - n
|
||||||
|
p = n/M
|
||||||
|
mu = N*p
|
||||||
|
|
||||||
|
var = m*n*N*(M - N)*1.0/(M*M*(M-1))
|
||||||
|
g1 = (m - n)*(M-2*N) / (M-2.0) * sqrt((M-1.0) / (m*n*N*(M-N)))
|
||||||
|
|
||||||
|
g2 = M*(M+1) - 6.*N*(M-N) - 6.*n*m
|
||||||
|
g2 *= (M-1)*M*M
|
||||||
|
g2 += 6.*n*N*(M-N)*m*(5.*M-6)
|
||||||
|
g2 /= n * N * (M-N) * m * (M-2.) * (M-3.)
|
||||||
|
return mu, var, g1, g2
|
||||||
|
|
||||||
|
def _entropy(self, M, n, N):
|
||||||
|
k = np.r_[N - (M - n):min(n, N) + 1]
|
||||||
|
vals = self.pmf(k, M, n, N)
|
||||||
|
h = -np.sum(special.xlogy(vals, vals), axis=0)
|
||||||
|
return h
|
||||||
|
|
||||||
|
def _sf(self, k, M, n, N):
|
||||||
|
"""More precise calculation, 1 - cdf doesn't cut it."""
|
||||||
|
# This for loop is needed because `k` can be an array. If that's the
|
||||||
|
# case, the sf() method makes M, n and N arrays of the same shape. We
|
||||||
|
# therefore unpack all inputs args, so we can do the manual
|
||||||
|
# integration.
|
||||||
|
res = []
|
||||||
|
for quant, tot, good, draw in zip(k, M, n, N):
|
||||||
|
# Manual integration over probability mass function. More accurate
|
||||||
|
# than integrate.quad.
|
||||||
|
k2 = np.arange(quant + 1, draw + 1)
|
||||||
|
res.append(np.sum(self._pmf(k2, tot, good, draw)))
|
||||||
|
return np.asarray(res)
|
||||||
|
hypergeom = hypergeom_gen(name='hypergeom')
|
||||||
|
|
||||||
|
|
||||||
|
# FIXME: Fails _cdfvec
|
||||||
|
class logser_gen(rv_discrete):
|
||||||
|
"""A Logarithmic (Log-Series, Series) discrete random variable.
|
||||||
|
|
||||||
|
%(before_notes)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The probability mass function for `logser` is::
|
||||||
|
|
||||||
|
logser.pmf(k) = - p**k / (k*log(1-p))
|
||||||
|
|
||||||
|
for ``k >= 1``.
|
||||||
|
|
||||||
|
`logser` takes ``p`` as shape parameter.
|
||||||
|
|
||||||
|
%(example)s
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _rvs(self, p):
|
||||||
|
# looks wrong for p>0.5, too few k=1
|
||||||
|
# trying to use generic is worse, no k=1 at all
|
||||||
|
return mtrand.logseries(p, size=self._size)
|
||||||
|
|
||||||
|
def _argcheck(self, p):
|
||||||
|
return (p > 0) & (p < 1)
|
||||||
|
|
||||||
|
def _pmf(self, k, p):
|
||||||
|
return -np.power(p, k) * 1.0 / k / log1p(- p)
|
||||||
|
|
||||||
|
def _stats(self, p):
|
||||||
|
r = log1p(-p)
|
||||||
|
mu = p / (p - 1.0) / r
|
||||||
|
mu2p = -p / r / (p - 1.0)**2
|
||||||
|
var = mu2p - mu*mu
|
||||||
|
mu3p = -p / r * (1.0+p) / (1.0 - p)**3
|
||||||
|
mu3 = mu3p - 3*mu*mu2p + 2*mu**3
|
||||||
|
g1 = mu3 / np.power(var, 1.5)
|
||||||
|
|
||||||
|
mu4p = -p / r * (
|
||||||
|
1.0 / (p-1)**2 - 6*p / (p - 1)**3 + 6*p*p / (p-1)**4)
|
||||||
|
mu4 = mu4p - 4*mu3p*mu + 6*mu2p*mu*mu - 3*mu**4
|
||||||
|
g2 = mu4 / var**2 - 3.0
|
||||||
|
return mu, var, g1, g2
|
||||||
|
logser = logser_gen(a=1, name='logser', longname='A logarithmic')
|
||||||
|
|
||||||
|
|
||||||
|
class poisson_gen(rv_discrete):
|
||||||
|
"""A Poisson discrete random variable.
|
||||||
|
|
||||||
|
%(before_notes)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The probability mass function for `poisson` is::
|
||||||
|
|
||||||
|
poisson.pmf(k) = exp(-mu) * mu**k / k!
|
||||||
|
|
||||||
|
for ``k >= 0``.
|
||||||
|
|
||||||
|
`poisson` takes ``mu`` as shape parameter.
|
||||||
|
|
||||||
|
%(example)s
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _rvs(self, mu):
|
||||||
|
return mtrand.poisson(mu, self._size)
|
||||||
|
|
||||||
|
def _logpmf(self, k, mu):
|
||||||
|
Pk = k*log(mu)-gamln(k+1) - mu
|
||||||
|
return Pk
|
||||||
|
|
||||||
|
def _pmf(self, k, mu):
|
||||||
|
return exp(self._logpmf(k, mu))
|
||||||
|
|
||||||
|
def _cdf(self, x, mu):
|
||||||
|
k = floor(x)
|
||||||
|
return special.pdtr(k, mu)
|
||||||
|
|
||||||
|
def _sf(self, x, mu):
|
||||||
|
k = floor(x)
|
||||||
|
return special.pdtrc(k, mu)
|
||||||
|
|
||||||
|
def _ppf(self, q, mu):
|
||||||
|
vals = ceil(special.pdtrik(q, mu))
|
||||||
|
vals1 = vals - 1
|
||||||
|
temp = special.pdtr(vals1, mu)
|
||||||
|
return np.where((temp >= q), vals1, vals)
|
||||||
|
|
||||||
|
def _stats(self, mu):
|
||||||
|
var = mu
|
||||||
|
tmp = np.asarray(mu)
|
||||||
|
g1 = sqrt(1.0 / tmp)
|
||||||
|
g2 = 1.0 / tmp
|
||||||
|
return mu, var, g1, g2
|
||||||
|
poisson = poisson_gen(name="poisson", longname='A Poisson')
|
||||||
|
|
||||||
|
|
||||||
|
class planck_gen(rv_discrete):
|
||||||
|
"""A Planck discrete exponential random variable.
|
||||||
|
|
||||||
|
%(before_notes)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The probability mass function for `planck` is::
|
||||||
|
|
||||||
|
planck.pmf(k) = (1-exp(-lambda_))*exp(-lambda_*k)
|
||||||
|
|
||||||
|
for ``k*lambda_ >= 0``.
|
||||||
|
|
||||||
|
`planck` takes ``lambda_`` as shape parameter.
|
||||||
|
|
||||||
|
%(example)s
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _argcheck(self, lambda_):
|
||||||
|
if (lambda_ > 0):
|
||||||
|
self.a = 0
|
||||||
|
self.b = np.inf
|
||||||
|
return 1
|
||||||
|
elif (lambda_ < 0):
|
||||||
|
self.a = -np.inf
|
||||||
|
self.b = 0
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
def _pmf(self, k, lambda_):
|
||||||
|
fact = -expm1(-lambda_)
|
||||||
|
return fact * exp(-lambda_ * k)
|
||||||
|
|
||||||
|
def _cdf(self, x, lambda_):
|
||||||
|
k = floor(x)
|
||||||
|
return - expm1(-lambda_ * (k + 1))
|
||||||
|
|
||||||
|
def _ppf(self, q, lambda_):
|
||||||
|
vals = ceil(-1.0/lambda_ * log1p(-q)-1)
|
||||||
|
vals1 = (vals-1).clip(self.a, np.inf)
|
||||||
|
temp = self._cdf(vals1, lambda_)
|
||||||
|
return np.where(temp >= q, vals1, vals)
|
||||||
|
|
||||||
|
def _stats(self, lambda_):
|
||||||
|
mu = 1/(exp(lambda_)-1)
|
||||||
|
var = exp(-lambda_)/(expm1(-lambda_))**2
|
||||||
|
g1 = 2*cosh(lambda_/2.0)
|
||||||
|
g2 = 4+2*cosh(lambda_)
|
||||||
|
return mu, var, g1, g2
|
||||||
|
|
||||||
|
def _entropy(self, lambda_):
|
||||||
|
l = lambda_
|
||||||
|
C = -expm1(-l)
|
||||||
|
return l * exp(-l) / C - log(C)
|
||||||
|
planck = planck_gen(name='planck', longname='A discrete exponential ')
|
||||||
|
|
||||||
|
|
||||||
|
class boltzmann_gen(rv_discrete):
|
||||||
|
"""A Boltzmann (Truncated Discrete Exponential) random variable.
|
||||||
|
|
||||||
|
%(before_notes)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The probability mass function for `boltzmann` is::
|
||||||
|
|
||||||
|
boltzmann.pmf(k) = (1-exp(-lambda_)*exp(-lambda_*k)/(1-exp(-lambda_*N))
|
||||||
|
|
||||||
|
for ``k = 0,..., N-1``.
|
||||||
|
|
||||||
|
`boltzmann` takes ``lambda_`` and ``N`` as shape parameters.
|
||||||
|
|
||||||
|
%(example)s
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _pmf(self, k, lambda_, N):
|
||||||
|
fact = (expm1(-lambda_))/(expm1(-lambda_*N))
|
||||||
|
return fact*exp(-lambda_*k)
|
||||||
|
|
||||||
|
def _cdf(self, x, lambda_, N):
|
||||||
|
k = floor(x)
|
||||||
|
return (expm1(-lambda_*(k+1)))/(expm1(-lambda_*N))
|
||||||
|
|
||||||
|
def _ppf(self, q, lambda_, N):
|
||||||
|
qnew = -q*(expm1(-lambda_*N))
|
||||||
|
vals = ceil(-1.0/lambda_ * log1p(-qnew)-1)
|
||||||
|
vals1 = (vals-1).clip(0.0, np.inf)
|
||||||
|
temp = self._cdf(vals1, lambda_, N)
|
||||||
|
return np.where(temp >= q, vals1, vals)
|
||||||
|
|
||||||
|
def _stats(self, lambda_, N):
|
||||||
|
z = exp(-lambda_)
|
||||||
|
zN = exp(-lambda_*N)
|
||||||
|
mu = z/(1.0-z)-N*zN/(1-zN)
|
||||||
|
var = z/(1.0-z)**2 - N*N*zN/(1-zN)**2
|
||||||
|
trm = (1-zN)/(1-z)
|
||||||
|
trm2 = (z*trm**2 - N*N*zN)
|
||||||
|
g1 = z*(1+z)*trm**3 - N**3*zN*(1+zN)
|
||||||
|
g1 = g1 / trm2**(1.5)
|
||||||
|
g2 = z*(1+4*z+z*z)*trm**4 - N**4 * zN*(1+4*zN+zN*zN)
|
||||||
|
g2 = g2 / trm2 / trm2
|
||||||
|
return mu, var, g1, g2
|
||||||
|
boltzmann = boltzmann_gen(name='boltzmann',
|
||||||
|
longname='A truncated discrete exponential ')
|
||||||
|
|
||||||
|
|
||||||
|
class randint_gen(rv_discrete):
|
||||||
|
"""A uniform discrete random variable.
|
||||||
|
|
||||||
|
%(before_notes)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The probability mass function for `randint` is::
|
||||||
|
|
||||||
|
randint.pmf(k) = 1./(high - low)
|
||||||
|
|
||||||
|
for ``k = low, ..., high - 1``.
|
||||||
|
|
||||||
|
`randint` takes ``low`` and ``high`` as shape parameters.
|
||||||
|
|
||||||
|
Note the difference to the numpy ``random_integers`` which
|
||||||
|
returns integers on a *closed* interval ``[low, high]``.
|
||||||
|
|
||||||
|
%(example)s
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _argcheck(self, low, high):
|
||||||
|
self.a = low
|
||||||
|
self.b = high - 1
|
||||||
|
return (high > low)
|
||||||
|
|
||||||
|
def _pmf(self, k, low, high):
|
||||||
|
p = np.ones_like(k) / (high - low)
|
||||||
|
return np.where((k >= low) & (k < high), p, 0.)
|
||||||
|
|
||||||
|
def _cdf(self, x, low, high):
|
||||||
|
k = floor(x)
|
||||||
|
return (k - low + 1.) / (high - low)
|
||||||
|
|
||||||
|
def _ppf(self, q, low, high):
|
||||||
|
vals = ceil(q * (high - low) + low) - 1
|
||||||
|
vals1 = (vals - 1).clip(low, high)
|
||||||
|
temp = self._cdf(vals1, low, high)
|
||||||
|
return np.where(temp >= q, vals1, vals)
|
||||||
|
|
||||||
|
def _stats(self, low, high):
|
||||||
|
m2, m1 = np.asarray(high), np.asarray(low)
|
||||||
|
mu = (m2 + m1 - 1.0) / 2
|
||||||
|
d = m2 - m1
|
||||||
|
var = (d*d - 1) / 12.0
|
||||||
|
g1 = 0.0
|
||||||
|
g2 = -6.0/5.0 * (d*d + 1.0) / (d*d - 1.0)
|
||||||
|
return mu, var, g1, g2
|
||||||
|
|
||||||
|
def _rvs(self, low, high=None):
|
||||||
|
"""An array of *size* random integers >= ``low`` and < ``high``.
|
||||||
|
|
||||||
|
If ``high`` is ``None``, then range is >=0 and < low
|
||||||
|
"""
|
||||||
|
return mtrand.randint(low, high, self._size)
|
||||||
|
|
||||||
|
def _entropy(self, low, high):
|
||||||
|
return log(high - low)
|
||||||
|
randint = randint_gen(name='randint', longname='A discrete uniform '
|
||||||
|
'(random integer)')
|
||||||
|
|
||||||
|
|
||||||
|
# FIXME: problems sampling.
|
||||||
|
class zipf_gen(rv_discrete):
|
||||||
|
"""A Zipf discrete random variable.
|
||||||
|
|
||||||
|
%(before_notes)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The probability mass function for `zipf` is::
|
||||||
|
|
||||||
|
zipf.pmf(k, a) = 1/(zeta(a) * k**a)
|
||||||
|
|
||||||
|
for ``k >= 1``.
|
||||||
|
|
||||||
|
`zipf` takes ``a`` as shape parameter.
|
||||||
|
|
||||||
|
%(example)s
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _rvs(self, a):
|
||||||
|
return mtrand.zipf(a, size=self._size)
|
||||||
|
|
||||||
|
def _argcheck(self, a):
|
||||||
|
return a > 1
|
||||||
|
|
||||||
|
def _pmf(self, k, a):
|
||||||
|
Pk = 1.0 / special.zeta(a, 1) / k**a
|
||||||
|
return Pk
|
||||||
|
|
||||||
|
def _munp(self, n, a):
|
||||||
|
return _lazywhere(
|
||||||
|
a > n + 1, (a, n),
|
||||||
|
lambda a, n: special.zeta(a - n, 1) / special.zeta(a, 1),
|
||||||
|
np.inf)
|
||||||
|
zipf = zipf_gen(a=1, name='zipf', longname='A Zipf')
|
||||||
|
|
||||||
|
|
||||||
|
class dlaplace_gen(rv_discrete):
|
||||||
|
"""A Laplacian discrete random variable.
|
||||||
|
|
||||||
|
%(before_notes)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
The probability mass function for `dlaplace` is::
|
||||||
|
|
||||||
|
dlaplace.pmf(k) = tanh(a/2) * exp(-a*abs(k))
|
||||||
|
|
||||||
|
for ``a > 0``.
|
||||||
|
|
||||||
|
`dlaplace` takes ``a`` as shape parameter.
|
||||||
|
|
||||||
|
%(example)s
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _pmf(self, k, a):
|
||||||
|
return tanh(a/2.0) * exp(-a * abs(k))
|
||||||
|
|
||||||
|
def _cdf(self, x, a):
|
||||||
|
k = floor(x)
|
||||||
|
f = lambda k, a: 1.0 - exp(-a * k) / (exp(a) + 1)
|
||||||
|
f2 = lambda k, a: exp(a * (k+1)) / (exp(a) + 1)
|
||||||
|
return _lazywhere(k >= 0, (k, a), f=f, f2=f2)
|
||||||
|
|
||||||
|
def _ppf(self, q, a):
|
||||||
|
const = 1 + exp(a)
|
||||||
|
vals = ceil(np.where(q < 1.0 / (1 + exp(-a)), log(q*const) / a - 1,
|
||||||
|
-log((1-q) * const) / a))
|
||||||
|
vals1 = vals - 1
|
||||||
|
return np.where(self._cdf(vals1, a) >= q, vals1, vals)
|
||||||
|
|
||||||
|
def _stats(self, a):
|
||||||
|
ea = exp(a)
|
||||||
|
mu2 = 2.*ea/(ea-1.)**2
|
||||||
|
mu4 = 2.*ea*(ea**2+10.*ea+1.) / (ea-1.)**4
|
||||||
|
return 0., mu2, 0., mu4/mu2**2 - 3.
|
||||||
|
|
||||||
|
def _entropy(self, a):
|
||||||
|
return a / sinh(a) - log(tanh(a/2.0))
|
||||||
|
dlaplace = dlaplace_gen(a=-np.inf,
|
||||||
|
name='dlaplace', longname='A discrete Laplacian')
|
||||||
|
|
||||||
|
|
||||||
|
class skellam_gen(rv_discrete):
|
||||||
|
"""A Skellam discrete random variable.
|
||||||
|
|
||||||
|
%(before_notes)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
Probability distribution of the difference of two correlated or
|
||||||
|
uncorrelated Poisson random variables.
|
||||||
|
|
||||||
|
Let k1 and k2 be two Poisson-distributed r.v. with expected values
|
||||||
|
lam1 and lam2. Then, ``k1 - k2`` follows a Skellam distribution with
|
||||||
|
parameters ``mu1 = lam1 - rho*sqrt(lam1*lam2)`` and
|
||||||
|
``mu2 = lam2 - rho*sqrt(lam1*lam2)``, where rho is the correlation
|
||||||
|
coefficient between k1 and k2. If the two Poisson-distributed r.v.
|
||||||
|
are independent then ``rho = 0``.
|
||||||
|
|
||||||
|
Parameters mu1 and mu2 must be strictly positive.
|
||||||
|
|
||||||
|
For details see: http://en.wikipedia.org/wiki/Skellam_distribution
|
||||||
|
|
||||||
|
`skellam` takes ``mu1`` and ``mu2`` as shape parameters.
|
||||||
|
|
||||||
|
%(example)s
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _rvs(self, mu1, mu2):
|
||||||
|
n = self._size
|
||||||
|
return mtrand.poisson(mu1, n) - mtrand.poisson(mu2, n)
|
||||||
|
|
||||||
|
def _pmf(self, x, mu1, mu2):
|
||||||
|
px = np.where(x < 0,
|
||||||
|
_ncx2_pdf(2*mu2, 2*(1-x), 2*mu1)*2,
|
||||||
|
_ncx2_pdf(2*mu1, 2*(1+x), 2*mu2)*2)
|
||||||
|
# ncx2.pdf() returns nan's for extremely low probabilities
|
||||||
|
return px
|
||||||
|
|
||||||
|
def _cdf(self, x, mu1, mu2):
|
||||||
|
x = floor(x)
|
||||||
|
px = np.where(x < 0,
|
||||||
|
_ncx2_cdf(2*mu2, -2*x, 2*mu1),
|
||||||
|
1-_ncx2_cdf(2*mu1, 2*(x+1), 2*mu2))
|
||||||
|
return px
|
||||||
|
|
||||||
|
def _stats(self, mu1, mu2):
|
||||||
|
mean = mu1 - mu2
|
||||||
|
var = mu1 + mu2
|
||||||
|
g1 = mean / sqrt((var)**3)
|
||||||
|
g2 = 1 / var
|
||||||
|
return mean, var, g1, g2
|
||||||
|
skellam = skellam_gen(a=-np.inf, name="skellam", longname='A Skellam')
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,493 @@
|
|||||||
|
#
|
||||||
|
# Author: Joris Vankerschaver 2013
|
||||||
|
#
|
||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
from scipy.misc import doccer
|
||||||
|
from functools import wraps
|
||||||
|
import numpy as np
|
||||||
|
import scipy.linalg
|
||||||
|
|
||||||
|
__all__ = ['multivariate_normal']
|
||||||
|
|
||||||
|
|
||||||
|
_LOG_2PI = np.log(2 * np.pi)
|
||||||
|
|
||||||
|
|
||||||
|
def _process_parameters(dim, mean, cov):
|
||||||
|
"""
|
||||||
|
Infer dimensionality from mean or covariance matrix, ensure that
|
||||||
|
mean and covariance are full vector resp. matrix.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Try to infer dimensionality
|
||||||
|
if dim is None:
|
||||||
|
if mean is None:
|
||||||
|
if cov is None:
|
||||||
|
dim = 1
|
||||||
|
else:
|
||||||
|
cov = np.asarray(cov, dtype=float)
|
||||||
|
if cov.ndim < 2:
|
||||||
|
dim = 1
|
||||||
|
else:
|
||||||
|
dim = cov.shape[0]
|
||||||
|
else:
|
||||||
|
mean = np.asarray(mean, dtype=float)
|
||||||
|
dim = mean.size
|
||||||
|
else:
|
||||||
|
if not np.isscalar(dim):
|
||||||
|
raise ValueError("Dimension of random variable must be a scalar.")
|
||||||
|
|
||||||
|
# Check input sizes and return full arrays for mean and cov if necessary
|
||||||
|
if mean is None:
|
||||||
|
mean = np.zeros(dim)
|
||||||
|
mean = np.asarray(mean, dtype=float)
|
||||||
|
|
||||||
|
if cov is None:
|
||||||
|
cov = 1.0
|
||||||
|
cov = np.asarray(cov, dtype=float)
|
||||||
|
|
||||||
|
if dim == 1:
|
||||||
|
mean.shape = (1,)
|
||||||
|
cov.shape = (1, 1)
|
||||||
|
|
||||||
|
if mean.ndim != 1 or mean.shape[0] != dim:
|
||||||
|
raise ValueError("Array 'mean' must be vector of length %d." % dim)
|
||||||
|
if cov.ndim == 0:
|
||||||
|
cov = cov * np.eye(dim)
|
||||||
|
elif cov.ndim == 1:
|
||||||
|
cov = np.diag(cov)
|
||||||
|
else:
|
||||||
|
if cov.shape != (dim, dim):
|
||||||
|
raise ValueError("Array 'cov' must be at most two-dimensional,"
|
||||||
|
" but cov.ndim = %d" % cov.ndim)
|
||||||
|
|
||||||
|
return dim, mean, cov
|
||||||
|
|
||||||
|
|
||||||
|
def _process_quantiles(x, dim):
|
||||||
|
"""
|
||||||
|
Adjust quantiles array so that last axis labels the components of
|
||||||
|
each data point.
|
||||||
|
|
||||||
|
"""
|
||||||
|
x = np.asarray(x, dtype=float)
|
||||||
|
|
||||||
|
if x.ndim == 0:
|
||||||
|
x = x[np.newaxis]
|
||||||
|
elif x.ndim == 1:
|
||||||
|
if dim == 1:
|
||||||
|
x = x[:, np.newaxis]
|
||||||
|
else:
|
||||||
|
x = x[np.newaxis, :]
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def _squeeze_output(out):
|
||||||
|
"""
|
||||||
|
Remove single-dimensional entries from array and convert to scalar,
|
||||||
|
if necessary.
|
||||||
|
|
||||||
|
"""
|
||||||
|
out = out.squeeze()
|
||||||
|
if out.ndim == 0:
|
||||||
|
out = out[()]
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _pinv_1d(v, eps=1e-5):
|
||||||
|
"""
|
||||||
|
A helper function for computing the pseudoinverse.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
v : iterable of numbers
|
||||||
|
This may be thought of as a vector of eigenvalues or singular values.
|
||||||
|
eps : float
|
||||||
|
Elements of v smaller than eps are considered negligible.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
v_pinv : 1d float ndarray
|
||||||
|
A vector of pseudo-inverted numbers.
|
||||||
|
|
||||||
|
"""
|
||||||
|
return np.array([0 if abs(x) < eps else 1/x for x in v], dtype=float)
|
||||||
|
|
||||||
|
|
||||||
|
def _psd_pinv_decomposed_log_pdet(mat, cond=None, rcond=None,
|
||||||
|
lower=True, check_finite=True):
|
||||||
|
"""
|
||||||
|
Compute a decomposition of the pseudo-inverse and the logarithm of
|
||||||
|
the pseudo-determinant of a symmetric positive semi-definite
|
||||||
|
matrix.
|
||||||
|
|
||||||
|
The pseudo-determinant of a matrix is defined as the product of
|
||||||
|
the non-zero eigenvalues, and coincides with the usual determinant
|
||||||
|
for a full matrix.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
mat : array_like
|
||||||
|
Input array of shape (`m`, `n`)
|
||||||
|
cond, rcond : float or None
|
||||||
|
Cutoff for 'small' singular values.
|
||||||
|
Eigenvalues smaller than ``rcond*largest_eigenvalue``
|
||||||
|
are considered zero.
|
||||||
|
If None or -1, suitable machine precision is used.
|
||||||
|
lower : bool, optional
|
||||||
|
Whether the pertinent array data is taken from the lower or upper
|
||||||
|
triangle of `mat`. (Default: lower)
|
||||||
|
check_finite : boolean, optional
|
||||||
|
Whether to check that the input matrix contains only finite numbers.
|
||||||
|
Disabling may give a performance gain, but may result in problems
|
||||||
|
(crashes, non-termination) if the inputs do contain infinities or NaNs.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
M : array_like
|
||||||
|
The pseudo-inverse of the input matrix is np.dot(M, M.T).
|
||||||
|
log_pdet : float
|
||||||
|
Logarithm of the pseudo-determinant of the matrix.
|
||||||
|
|
||||||
|
"""
|
||||||
|
# Compute the symmetric eigendecomposition.
|
||||||
|
# The input covariance matrix is required to be real symmetric
|
||||||
|
# and positive semidefinite which implies that its eigenvalues
|
||||||
|
# are all real and non-negative,
|
||||||
|
# but clip them anyway to avoid numerical issues.
|
||||||
|
|
||||||
|
# TODO: the code to set cond/rcond is identical to that in
|
||||||
|
# scipy.linalg.{pinvh, pinv2} and if/when this function is subsumed
|
||||||
|
# into scipy.linalg it should probably be shared between all of
|
||||||
|
# these routines.
|
||||||
|
|
||||||
|
# Note that eigh takes care of array conversion, chkfinite,
|
||||||
|
# and assertion that the matrix is square.
|
||||||
|
s, u = scipy.linalg.eigh(mat, lower=lower, check_finite=check_finite)
|
||||||
|
|
||||||
|
if rcond is not None:
|
||||||
|
cond = rcond
|
||||||
|
if cond in [None, -1]:
|
||||||
|
t = u.dtype.char.lower()
|
||||||
|
factor = {'f': 1E3, 'd': 1E6}
|
||||||
|
cond = factor[t] * np.finfo(t).eps
|
||||||
|
eps = cond * np.max(abs(s))
|
||||||
|
|
||||||
|
if np.min(s) < -eps:
|
||||||
|
raise ValueError('the covariance matrix must be positive semidefinite')
|
||||||
|
|
||||||
|
s_pinv = _pinv_1d(s, eps)
|
||||||
|
U = np.multiply(u, np.sqrt(s_pinv))
|
||||||
|
log_pdet = np.sum(np.log(s[s > eps]))
|
||||||
|
|
||||||
|
return U, log_pdet
|
||||||
|
|
||||||
|
|
||||||
|
_doc_default_callparams = \
|
||||||
|
"""mean : array_like, optional
|
||||||
|
Mean of the distribution (default zero)
|
||||||
|
cov : array_like, optional
|
||||||
|
Covariance matrix of the distribution (default one)
|
||||||
|
"""
|
||||||
|
|
||||||
|
_doc_callparams_note = \
|
||||||
|
"""Setting the parameter `mean` to `None` is equivalent to having `mean`
|
||||||
|
be the zero-vector. The parameter `cov` can be a scalar, in which case
|
||||||
|
the covariance matrix is the identity times that value, a vector of
|
||||||
|
diagonal entries for the covariance matrix, or a two-dimensional
|
||||||
|
array_like.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_doc_frozen_callparams = ""
|
||||||
|
|
||||||
|
_doc_frozen_callparams_note = \
|
||||||
|
"""See class definition for a detailed description of parameters."""
|
||||||
|
|
||||||
|
docdict_params = {
|
||||||
|
'_doc_default_callparams': _doc_default_callparams,
|
||||||
|
'_doc_callparams_note': _doc_callparams_note
|
||||||
|
}
|
||||||
|
|
||||||
|
docdict_noparams = {
|
||||||
|
'_doc_default_callparams': _doc_frozen_callparams,
|
||||||
|
'_doc_callparams_note': _doc_frozen_callparams_note
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class multivariate_normal_gen(object):
|
||||||
|
r"""
|
||||||
|
A multivariate normal random variable.
|
||||||
|
|
||||||
|
The `mean` keyword specifies the mean. The `cov` keyword specifies the
|
||||||
|
covariance matrix.
|
||||||
|
|
||||||
|
.. versionadded:: 0.14.0
|
||||||
|
|
||||||
|
Methods
|
||||||
|
-------
|
||||||
|
pdf(x, mean=None, cov=1)
|
||||||
|
Probability density function.
|
||||||
|
logpdf(x, mean=None, cov=1)
|
||||||
|
Log of the probability density function.
|
||||||
|
rvs(mean=None, cov=1)
|
||||||
|
Draw random samples from a multivariate normal distribution.
|
||||||
|
entropy()
|
||||||
|
Compute the differential entropy of the multivariate normal.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x : array_like
|
||||||
|
Quantiles, with the last axis of `x` denoting the components.
|
||||||
|
%(_doc_default_callparams)s
|
||||||
|
|
||||||
|
Alternatively, the object may be called (as a function) to fix the mean
|
||||||
|
and covariance parameters, returning a "frozen" multivariate normal
|
||||||
|
random variable:
|
||||||
|
|
||||||
|
rv = multivariate_normal(mean=None, scale=1)
|
||||||
|
- Frozen object with the same methods but holding the given
|
||||||
|
mean and covariance fixed.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
%(_doc_callparams_note)s
|
||||||
|
|
||||||
|
The covariance matrix `cov` must be a (symmetric) positive
|
||||||
|
semi-definite matrix. The determinant and inverse of `cov` are computed
|
||||||
|
as the pseudo-determinant and pseudo-inverse, respectively, so
|
||||||
|
that `cov` does not need to have full rank.
|
||||||
|
|
||||||
|
The probability density function for `multivariate_normal` is
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
|
||||||
|
f(x) = \frac{1}{\sqrt{(2 \pi)^k \det \Sigma}} \exp\left( -\frac{1}{2} (x - \mu)^T \Sigma^{-1} (x - \mu) \right),
|
||||||
|
|
||||||
|
where :math:`\mu` is the mean, :math:`\Sigma` the covariance matrix,
|
||||||
|
and :math:`k` is the dimension of the space where :math:`x` takes values.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> from scipy.stats import multivariate_normal
|
||||||
|
>>> x = np.linspace(0, 5, 10, endpoint=False)
|
||||||
|
>>> y = multivariate_normal.pdf(x, mean=2.5, cov=0.5); y
|
||||||
|
array([ 0.00108914, 0.01033349, 0.05946514, 0.20755375, 0.43939129,
|
||||||
|
0.56418958, 0.43939129, 0.20755375, 0.05946514, 0.01033349])
|
||||||
|
>>> plt.plot(x, y)
|
||||||
|
|
||||||
|
The input quantiles can be any shape of array, as long as the last
|
||||||
|
axis labels the components. This allows us for instance to
|
||||||
|
display the frozen pdf for a non-isotropic random variable in 2D as
|
||||||
|
follows:
|
||||||
|
|
||||||
|
>>> x, y = np.mgrid[-1:1:.01, -1:1:.01]
|
||||||
|
>>> pos = np.empty(x.shape + (2,))
|
||||||
|
>>> pos[:, :, 0] = x; pos[:, :, 1] = y
|
||||||
|
>>> rv = multivariate_normal([0.5, -0.2], [[2.0, 0.3], [0.3, 0.5]])
|
||||||
|
>>> plt.contourf(x, y, rv.pdf(pos))
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.__doc__ = doccer.docformat(self.__doc__, docdict_params)
|
||||||
|
|
||||||
|
def __call__(self, mean=None, cov=1):
|
||||||
|
"""
|
||||||
|
Create a frozen multivariate normal distribution.
|
||||||
|
|
||||||
|
See `multivariate_normal_frozen` for more information.
|
||||||
|
|
||||||
|
"""
|
||||||
|
return multivariate_normal_frozen(mean, cov)
|
||||||
|
|
||||||
|
def _logpdf(self, x, mean, prec_U, log_det_cov):
|
||||||
|
"""
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x : ndarray
|
||||||
|
Points at which to evaluate the log of the probability
|
||||||
|
density function
|
||||||
|
mean : ndarray
|
||||||
|
Mean of the distribution
|
||||||
|
prec_U : ndarray
|
||||||
|
A decomposition such that np.dot(prec_U, prec_U.T)
|
||||||
|
is the precision matrix, i.e. inverse of the covariance matrix.
|
||||||
|
log_det_cov : float
|
||||||
|
Logarithm of the determinant of the covariance matrix
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
As this function does no argument checking, it should not be
|
||||||
|
called directly; use 'logpdf' instead.
|
||||||
|
|
||||||
|
"""
|
||||||
|
dim = x.shape[-1]
|
||||||
|
dev = x - mean
|
||||||
|
maha = np.sum(np.square(np.dot(dev, prec_U)), axis=-1)
|
||||||
|
return -0.5 * (dim * _LOG_2PI + log_det_cov + maha)
|
||||||
|
|
||||||
|
def logpdf(self, x, mean, cov):
|
||||||
|
"""
|
||||||
|
Log of the multivariate normal probability density function.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x : array_like
|
||||||
|
Quantiles, with the last axis of `x` denoting the components.
|
||||||
|
%(_doc_default_callparams)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
%(_doc_callparams_note)s
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pdf : ndarray
|
||||||
|
Log of the probability density function evaluated at `x`
|
||||||
|
|
||||||
|
"""
|
||||||
|
dim, mean, cov = _process_parameters(None, mean, cov)
|
||||||
|
x = _process_quantiles(x, dim)
|
||||||
|
prec_U, log_det_cov = _psd_pinv_decomposed_log_pdet(cov)
|
||||||
|
out = self._logpdf(x, mean, prec_U, log_det_cov)
|
||||||
|
return _squeeze_output(out)
|
||||||
|
|
||||||
|
def pdf(self, x, mean, cov):
|
||||||
|
"""
|
||||||
|
Multivariate normal probability density function.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
x : array_like
|
||||||
|
Quantiles, with the last axis of `x` denoting the components.
|
||||||
|
%(_doc_default_callparams)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
%(_doc_callparams_note)s
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pdf : ndarray
|
||||||
|
Probability density function evaluated at `x`
|
||||||
|
|
||||||
|
"""
|
||||||
|
dim, mean, cov = _process_parameters(None, mean, cov)
|
||||||
|
x = _process_quantiles(x, dim)
|
||||||
|
prec_U, log_det_cov = _psd_pinv_decomposed_log_pdet(cov)
|
||||||
|
out = np.exp(self._logpdf(x, mean, prec_U, log_det_cov))
|
||||||
|
return _squeeze_output(out)
|
||||||
|
|
||||||
|
def rvs(self, mean=None, cov=1, size=1):
|
||||||
|
"""
|
||||||
|
Draw random samples from a multivariate normal distribution.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
%(_doc_default_callparams)s
|
||||||
|
size : integer, optional
|
||||||
|
Number of samples to draw (default 1).
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
%(_doc_callparams_note)s
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
rvs : ndarray or scalar
|
||||||
|
Random variates of size (`size`, `N`), where `N` is the
|
||||||
|
dimension of the random variable.
|
||||||
|
|
||||||
|
"""
|
||||||
|
dim, mean, cov = _process_parameters(None, mean, cov)
|
||||||
|
out = np.random.multivariate_normal(mean, cov, size)
|
||||||
|
return _squeeze_output(out)
|
||||||
|
|
||||||
|
def entropy(self, mean=None, cov=1):
|
||||||
|
"""
|
||||||
|
Compute the differential entropy of the multivariate normal.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
%(_doc_default_callparams)s
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
%(_doc_callparams_note)s
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
h : scalar
|
||||||
|
Entropy of the multivariate normal distribution
|
||||||
|
|
||||||
|
"""
|
||||||
|
dim, mean, cov = _process_parameters(None, mean, cov)
|
||||||
|
return 1/2 * np.log(np.linalg.det(2 * np.pi * np.e * cov))
|
||||||
|
|
||||||
|
multivariate_normal = multivariate_normal_gen()
|
||||||
|
|
||||||
|
|
||||||
|
class multivariate_normal_frozen(object):
|
||||||
|
def __init__(self, mean=None, cov=1):
|
||||||
|
"""
|
||||||
|
Create a frozen multivariate normal distribution.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
mean : array_like, optional
|
||||||
|
Mean of the distribution (default zero)
|
||||||
|
cov : array_like, optional
|
||||||
|
Covariance matrix of the distribution (default one)
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
When called with the default parameters, this will create a 1D random
|
||||||
|
variable with mean 0 and covariance 1:
|
||||||
|
|
||||||
|
>>> from scipy.stats import multivariate_normal
|
||||||
|
>>> r = multivariate_normal()
|
||||||
|
>>> r.mean
|
||||||
|
array([ 0.])
|
||||||
|
>>> r.cov
|
||||||
|
array([[1.]])
|
||||||
|
|
||||||
|
"""
|
||||||
|
self.dim, self.mean, self.cov = _process_parameters(None, mean, cov)
|
||||||
|
self.prec_U, self._log_det_cov = _psd_pinv_decomposed_log_pdet(self.cov)
|
||||||
|
|
||||||
|
self._mnorm = multivariate_normal_gen()
|
||||||
|
|
||||||
|
def logpdf(self, x):
|
||||||
|
x = _process_quantiles(x, self.dim)
|
||||||
|
out = self._mnorm._logpdf(x, self.mean, self.prec_U, self._log_det_cov)
|
||||||
|
return _squeeze_output(out)
|
||||||
|
|
||||||
|
def pdf(self, x):
|
||||||
|
return np.exp(self.logpdf(x))
|
||||||
|
|
||||||
|
def rvs(self, size=1):
|
||||||
|
return self._mnorm.rvs(self.mean, self.cov, size)
|
||||||
|
|
||||||
|
def entropy(self):
|
||||||
|
"""
|
||||||
|
Computes the differential entropy of the multivariate normal.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
h : scalar
|
||||||
|
Entropy of the multivariate normal distribution
|
||||||
|
|
||||||
|
"""
|
||||||
|
return 1/2 * (self.dim * (_LOG_2PI + 1) + self._log_det_cov)
|
||||||
|
|
||||||
|
|
||||||
|
# Set frozen generator docstrings from corresponding docstrings in
|
||||||
|
# multivariate_normal_gen and fill in default strings in class docstrings
|
||||||
|
for name in ['logpdf', 'pdf', 'rvs']:
|
||||||
|
method = multivariate_normal_gen.__dict__[name]
|
||||||
|
method_frozen = multivariate_normal_frozen.__dict__[name]
|
||||||
|
method_frozen.__doc__ = doccer.docformat(method.__doc__, docdict_noparams)
|
||||||
|
method.__doc__ = doccer.docformat(method.__doc__, docdict_params)
|
@ -0,0 +1,201 @@
|
|||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from numpy import poly1d
|
||||||
|
from scipy.special import beta
|
||||||
|
|
||||||
|
|
||||||
|
# The following code was used to generate the Pade coefficients for the
|
||||||
|
# Tukey Lambda variance function. Version 0.17 of mpmath was used.
|
||||||
|
#---------------------------------------------------------------------------
|
||||||
|
# import mpmath as mp
|
||||||
|
#
|
||||||
|
# mp.mp.dps = 60
|
||||||
|
#
|
||||||
|
# one = mp.mpf(1)
|
||||||
|
# two = mp.mpf(2)
|
||||||
|
#
|
||||||
|
# def mpvar(lam):
|
||||||
|
# if lam == 0:
|
||||||
|
# v = mp.pi**2 / three
|
||||||
|
# else:
|
||||||
|
# v = (two / lam**2) * (one / (one + two*lam) -
|
||||||
|
# mp.beta(lam + one, lam + one))
|
||||||
|
# return v
|
||||||
|
#
|
||||||
|
# t = mp.taylor(mpvar, 0, 8)
|
||||||
|
# p, q = mp.pade(t, 4, 4)
|
||||||
|
# print "p =", [mp.fp.mpf(c) for c in p]
|
||||||
|
# print "q =", [mp.fp.mpf(c) for c in q]
|
||||||
|
#---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Pade coefficients for the Tukey Lambda variance function.
|
||||||
|
_tukeylambda_var_pc = [3.289868133696453, 0.7306125098871127,
|
||||||
|
-0.5370742306855439, 0.17292046290190008,
|
||||||
|
-0.02371146284628187]
|
||||||
|
_tukeylambda_var_qc = [1.0, 3.683605511659861, 4.184152498888124,
|
||||||
|
1.7660926747377275, 0.2643989311168465]
|
||||||
|
|
||||||
|
# numpy.poly1d instances for the numerator and denominator of the
|
||||||
|
# Pade approximation to the Tukey Lambda variance.
|
||||||
|
_tukeylambda_var_p = poly1d(_tukeylambda_var_pc[::-1])
|
||||||
|
_tukeylambda_var_q = poly1d(_tukeylambda_var_qc[::-1])
|
||||||
|
|
||||||
|
|
||||||
|
def tukeylambda_variance(lam):
|
||||||
|
"""Variance of the Tukey Lambda distribution.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
lam : array_like
|
||||||
|
The lambda values at which to compute the variance.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
v : ndarray
|
||||||
|
The variance. For lam < -0.5, the variance is not defined, so
|
||||||
|
np.nan is returned. For lam = 0.5, np.inf is returned.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
In an interval around lambda=0, this function uses the [4,4] Pade
|
||||||
|
approximation to compute the variance. Otherwise it uses the standard
|
||||||
|
formula (http://en.wikipedia.org/wiki/Tukey_lambda_distribution). The
|
||||||
|
Pade approximation is used because the standard formula has a removable
|
||||||
|
discontinuity at lambda = 0, and does not produce accurate numerical
|
||||||
|
results near lambda = 0.
|
||||||
|
"""
|
||||||
|
lam = np.asarray(lam)
|
||||||
|
shp = lam.shape
|
||||||
|
lam = np.atleast_1d(lam).astype(np.float64)
|
||||||
|
|
||||||
|
# For absolute values of lam less than threshold, use the Pade
|
||||||
|
# approximation.
|
||||||
|
threshold = 0.075
|
||||||
|
|
||||||
|
# Play games with masks to implement the conditional evaluation of
|
||||||
|
# the distribution.
|
||||||
|
# lambda < -0.5: var = nan
|
||||||
|
low_mask = lam < -0.5
|
||||||
|
# lambda == -0.5: var = inf
|
||||||
|
neghalf_mask = lam == -0.5
|
||||||
|
# abs(lambda) < threshold: use Pade approximation
|
||||||
|
small_mask = np.abs(lam) < threshold
|
||||||
|
# else the "regular" case: use the explicit formula.
|
||||||
|
reg_mask = ~(low_mask | neghalf_mask | small_mask)
|
||||||
|
|
||||||
|
# Get the 'lam' values for the cases where they are needed.
|
||||||
|
small = lam[small_mask]
|
||||||
|
reg = lam[reg_mask]
|
||||||
|
|
||||||
|
# Compute the function for each case.
|
||||||
|
v = np.empty_like(lam)
|
||||||
|
v[low_mask] = np.nan
|
||||||
|
v[neghalf_mask] = np.inf
|
||||||
|
if small.size > 0:
|
||||||
|
# Use the Pade approximation near lambda = 0.
|
||||||
|
v[small_mask] = _tukeylambda_var_p(small) / _tukeylambda_var_q(small)
|
||||||
|
if reg.size > 0:
|
||||||
|
v[reg_mask] = (2.0 / reg**2) * (1.0 / (1.0 + 2 * reg) -
|
||||||
|
beta(reg + 1, reg + 1))
|
||||||
|
v.shape = shp
|
||||||
|
return v
|
||||||
|
|
||||||
|
|
||||||
|
# The following code was used to generate the Pade coefficients for the
|
||||||
|
# Tukey Lambda kurtosis function. Version 0.17 of mpmath was used.
|
||||||
|
#---------------------------------------------------------------------------
|
||||||
|
# import mpmath as mp
|
||||||
|
#
|
||||||
|
# mp.mp.dps = 60
|
||||||
|
#
|
||||||
|
# one = mp.mpf(1)
|
||||||
|
# two = mp.mpf(2)
|
||||||
|
# three = mp.mpf(3)
|
||||||
|
# four = mp.mpf(4)
|
||||||
|
#
|
||||||
|
# def mpkurt(lam):
|
||||||
|
# if lam == 0:
|
||||||
|
# k = mp.mpf(6)/5
|
||||||
|
# else:
|
||||||
|
# numer = (one/(four*lam+one) - four*mp.beta(three*lam+one, lam+one) +
|
||||||
|
# three*mp.beta(two*lam+one, two*lam+one))
|
||||||
|
# denom = two*(one/(two*lam+one) - mp.beta(lam+one,lam+one))**2
|
||||||
|
# k = numer / denom - three
|
||||||
|
# return k
|
||||||
|
#
|
||||||
|
# # There is a bug in mpmath 0.17: when we use the 'method' keyword of the
|
||||||
|
# # taylor function and we request a degree 9 Taylor polynomial, we actually
|
||||||
|
# # get degree 8.
|
||||||
|
# t = mp.taylor(mpkurt, 0, 9, method='quad', radius=0.01)
|
||||||
|
# t = [mp.chop(c, tol=1e-15) for c in t]
|
||||||
|
# p, q = mp.pade(t, 4, 4)
|
||||||
|
# print "p =", [mp.fp.mpf(c) for c in p]
|
||||||
|
# print "q =", [mp.fp.mpf(c) for c in q]
|
||||||
|
#---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Pade coefficients for the Tukey Lambda kurtosis function.
|
||||||
|
_tukeylambda_kurt_pc = [1.2, -5.853465139719495, -22.653447381131077,
|
||||||
|
0.20601184383406815, 4.59796302262789]
|
||||||
|
_tukeylambda_kurt_qc = [1.0, 7.171149192233599, 12.96663094361842,
|
||||||
|
0.43075235247853005, -2.789746758009912]
|
||||||
|
|
||||||
|
# numpy.poly1d instances for the numerator and denominator of the
|
||||||
|
# Pade approximation to the Tukey Lambda kurtosis.
|
||||||
|
_tukeylambda_kurt_p = poly1d(_tukeylambda_kurt_pc[::-1])
|
||||||
|
_tukeylambda_kurt_q = poly1d(_tukeylambda_kurt_qc[::-1])
|
||||||
|
|
||||||
|
|
||||||
|
def tukeylambda_kurtosis(lam):
|
||||||
|
"""Kurtosis of the Tukey Lambda distribution.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
lam : array_like
|
||||||
|
The lambda values at which to compute the variance.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
v : ndarray
|
||||||
|
The variance. For lam < -0.25, the variance is not defined, so
|
||||||
|
np.nan is returned. For lam = 0.25, np.inf is returned.
|
||||||
|
|
||||||
|
"""
|
||||||
|
lam = np.asarray(lam)
|
||||||
|
shp = lam.shape
|
||||||
|
lam = np.atleast_1d(lam).astype(np.float64)
|
||||||
|
|
||||||
|
# For absolute values of lam less than threshold, use the Pade
|
||||||
|
# approximation.
|
||||||
|
threshold = 0.055
|
||||||
|
|
||||||
|
# Use masks to implement the conditional evaluation of the kurtosis.
|
||||||
|
# lambda < -0.25: kurtosis = nan
|
||||||
|
low_mask = lam < -0.25
|
||||||
|
# lambda == -0.25: kurtosis = inf
|
||||||
|
negqrtr_mask = lam == -0.25
|
||||||
|
# lambda near 0: use Pade approximation
|
||||||
|
small_mask = np.abs(lam) < threshold
|
||||||
|
# else the "regular" case: use the explicit formula.
|
||||||
|
reg_mask = ~(low_mask | negqrtr_mask | small_mask)
|
||||||
|
|
||||||
|
# Get the 'lam' values for the cases where they are needed.
|
||||||
|
small = lam[small_mask]
|
||||||
|
reg = lam[reg_mask]
|
||||||
|
|
||||||
|
# Compute the function for each case.
|
||||||
|
k = np.empty_like(lam)
|
||||||
|
k[low_mask] = np.nan
|
||||||
|
k[negqrtr_mask] = np.inf
|
||||||
|
if small.size > 0:
|
||||||
|
k[small_mask] = _tukeylambda_kurt_p(small) / _tukeylambda_kurt_q(small)
|
||||||
|
if reg.size > 0:
|
||||||
|
numer = (1.0 / (4 * reg + 1) - 4 * beta(3 * reg + 1, reg + 1) +
|
||||||
|
3 * beta(2 * reg + 1, 2 * reg + 1))
|
||||||
|
denom = 2 * (1.0/(2 * reg + 1) - beta(reg + 1, reg + 1))**2
|
||||||
|
k[reg_mask] = numer / denom - 3
|
||||||
|
|
||||||
|
# The return value will be a numpy array; resetting the shape ensures that
|
||||||
|
# if `lam` was a scalar, the return value is a 0-d array.
|
||||||
|
k.shape = shp
|
||||||
|
return k
|
@ -0,0 +1,271 @@
|
|||||||
|
"""Some functions for working with contingency tables (i.e. cross tabulations).
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
from functools import reduce
|
||||||
|
import numpy as np
|
||||||
|
from .stats import power_divergence
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ['margins', 'expected_freq', 'chi2_contingency']
|
||||||
|
|
||||||
|
|
||||||
|
def margins(a):
|
||||||
|
"""Return a list of the marginal sums of the array `a`.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
a : ndarray
|
||||||
|
The array for which to compute the marginal sums.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
margsums : list of ndarrays
|
||||||
|
A list of length `a.ndim`. `margsums[k]` is the result
|
||||||
|
of summing `a` over all axes except `k`; it has the same
|
||||||
|
number of dimensions as `a`, but the length of each axis
|
||||||
|
except axis `k` will be 1.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> a = np.arange(12).reshape(2, 6)
|
||||||
|
>>> a
|
||||||
|
array([[ 0, 1, 2, 3, 4, 5],
|
||||||
|
[ 6, 7, 8, 9, 10, 11]])
|
||||||
|
>>> m0, m1 = margins(a)
|
||||||
|
>>> m0
|
||||||
|
array([[15],
|
||||||
|
[51]])
|
||||||
|
>>> m1
|
||||||
|
array([[ 6, 8, 10, 12, 14, 16]])
|
||||||
|
|
||||||
|
>>> b = np.arange(24).reshape(2,3,4)
|
||||||
|
>>> m0, m1, m2 = margins(b)
|
||||||
|
>>> m0
|
||||||
|
array([[[ 66]],
|
||||||
|
[[210]]])
|
||||||
|
>>> m1
|
||||||
|
array([[[ 60],
|
||||||
|
[ 92],
|
||||||
|
[124]]])
|
||||||
|
>>> m2
|
||||||
|
array([[[60, 66, 72, 78]]])
|
||||||
|
"""
|
||||||
|
margsums = []
|
||||||
|
ranged = list(range(a.ndim))
|
||||||
|
for k in ranged:
|
||||||
|
marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
|
||||||
|
margsums.append(marg)
|
||||||
|
return margsums
|
||||||
|
|
||||||
|
|
||||||
|
def expected_freq(observed):
|
||||||
|
"""
|
||||||
|
Compute the expected frequencies from a contingency table.
|
||||||
|
|
||||||
|
Given an n-dimensional contingency table of observed frequencies,
|
||||||
|
compute the expected frequencies for the table based on the marginal
|
||||||
|
sums under the assumption that the groups associated with each
|
||||||
|
dimension are independent.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
observed : array_like
|
||||||
|
The table of observed frequencies. (While this function can handle
|
||||||
|
a 1-D array, that case is trivial. Generally `observed` is at
|
||||||
|
least 2-D.)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
expected : ndarray of float64
|
||||||
|
The expected frequencies, based on the marginal sums of the table.
|
||||||
|
Same shape as `observed`.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> observed = np.array([[10, 10, 20],[20, 20, 20]])
|
||||||
|
>>> expected_freq(observed)
|
||||||
|
array([[ 12., 12., 16.],
|
||||||
|
[ 18., 18., 24.]])
|
||||||
|
|
||||||
|
"""
|
||||||
|
# Typically `observed` is an integer array. If `observed` has a large
|
||||||
|
# number of dimensions or holds large values, some of the following
|
||||||
|
# computations may overflow, so we first switch to floating point.
|
||||||
|
observed = np.asarray(observed, dtype=np.float64)
|
||||||
|
|
||||||
|
# Create a list of the marginal sums.
|
||||||
|
margsums = margins(observed)
|
||||||
|
|
||||||
|
# Create the array of expected frequencies. The shapes of the
|
||||||
|
# marginal sums returned by apply_over_axes() are just what we
|
||||||
|
# need for broadcasting in the following product.
|
||||||
|
d = observed.ndim
|
||||||
|
expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
|
||||||
|
return expected
|
||||||
|
|
||||||
|
|
||||||
|
def chi2_contingency(observed, correction=True, lambda_=None):
|
||||||
|
"""Chi-square test of independence of variables in a contingency table.
|
||||||
|
|
||||||
|
This function computes the chi-square statistic and p-value for the
|
||||||
|
hypothesis test of independence of the observed frequencies in the
|
||||||
|
contingency table [1]_ `observed`. The expected frequencies are computed
|
||||||
|
based on the marginal sums under the assumption of independence; see
|
||||||
|
`scipy.stats.contingency.expected_freq`. The number of degrees of
|
||||||
|
freedom is (expressed using numpy functions and attributes)::
|
||||||
|
|
||||||
|
dof = observed.size - sum(observed.shape) + observed.ndim - 1
|
||||||
|
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
observed : array_like
|
||||||
|
The contingency table. The table contains the observed frequencies
|
||||||
|
(i.e. number of occurrences) in each category. In the two-dimensional
|
||||||
|
case, the table is often described as an "R x C table".
|
||||||
|
correction : bool, optional
|
||||||
|
If True, *and* the degrees of freedom is 1, apply Yates' correction
|
||||||
|
for continuity. The effect of the correction is to adjust each
|
||||||
|
observed value by 0.5 towards the corresponding expected value.
|
||||||
|
lambda_ : float or str, optional.
|
||||||
|
By default, the statistic computed in this test is Pearson's
|
||||||
|
chi-squared statistic [2]_. `lambda_` allows a statistic from the
|
||||||
|
Cressie-Read power divergence family [3]_ to be used instead. See
|
||||||
|
`power_divergence` for details.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
chi2 : float
|
||||||
|
The test statistic.
|
||||||
|
p : float
|
||||||
|
The p-value of the test
|
||||||
|
dof : int
|
||||||
|
Degrees of freedom
|
||||||
|
expected : ndarray, same shape as `observed`
|
||||||
|
The expected frequencies, based on the marginal sums of the table.
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
contingency.expected_freq
|
||||||
|
fisher_exact
|
||||||
|
chisquare
|
||||||
|
power_divergence
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
An often quoted guideline for the validity of this calculation is that
|
||||||
|
the test should be used only if the observed and expected frequency in
|
||||||
|
each cell is at least 5.
|
||||||
|
|
||||||
|
This is a test for the independence of different categories of a
|
||||||
|
population. The test is only meaningful when the dimension of
|
||||||
|
`observed` is two or more. Applying the test to a one-dimensional
|
||||||
|
table will always result in `expected` equal to `observed` and a
|
||||||
|
chi-square statistic equal to 0.
|
||||||
|
|
||||||
|
This function does not handle masked arrays, because the calculation
|
||||||
|
does not make sense with missing values.
|
||||||
|
|
||||||
|
Like stats.chisquare, this function computes a chi-square statistic;
|
||||||
|
the convenience this function provides is to figure out the expected
|
||||||
|
frequencies and degrees of freedom from the given contingency table.
|
||||||
|
If these were already known, and if the Yates' correction was not
|
||||||
|
required, one could use stats.chisquare. That is, if one calls::
|
||||||
|
|
||||||
|
chi2, p, dof, ex = chi2_contingency(obs, correction=False)
|
||||||
|
|
||||||
|
then the following is true::
|
||||||
|
|
||||||
|
(chi2, p) == stats.chisquare(obs.ravel(), f_exp=ex.ravel(),
|
||||||
|
ddof=obs.size - 1 - dof)
|
||||||
|
|
||||||
|
The `lambda_` argument was added in version 0.13.0 of scipy.
|
||||||
|
|
||||||
|
References
|
||||||
|
----------
|
||||||
|
.. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table
|
||||||
|
.. [2] "Pearson's chi-squared test",
|
||||||
|
http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
|
||||||
|
.. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
|
||||||
|
Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
|
||||||
|
pp. 440-464.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
A two-way example (2 x 3):
|
||||||
|
|
||||||
|
>>> obs = np.array([[10, 10, 20], [20, 20, 20]])
|
||||||
|
>>> chi2_contingency(obs)
|
||||||
|
(2.7777777777777777,
|
||||||
|
0.24935220877729619,
|
||||||
|
2,
|
||||||
|
array([[ 12., 12., 16.],
|
||||||
|
[ 18., 18., 24.]]))
|
||||||
|
|
||||||
|
Perform the test using the log-likelihood ratio (i.e. the "G-test")
|
||||||
|
instead of Pearson's chi-squared statistic.
|
||||||
|
|
||||||
|
>>> g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood")
|
||||||
|
>>> g, p
|
||||||
|
(2.7688587616781319, 0.25046668010954165)
|
||||||
|
|
||||||
|
A four-way example (2 x 2 x 2 x 2):
|
||||||
|
|
||||||
|
>>> obs = np.array(
|
||||||
|
... [[[[12, 17],
|
||||||
|
... [11, 16]],
|
||||||
|
... [[11, 12],
|
||||||
|
... [15, 16]]],
|
||||||
|
... [[[23, 15],
|
||||||
|
... [30, 22]],
|
||||||
|
... [[14, 17],
|
||||||
|
... [15, 16]]]])
|
||||||
|
>>> chi2_contingency(obs)
|
||||||
|
(8.7584514426741897,
|
||||||
|
0.64417725029295503,
|
||||||
|
11,
|
||||||
|
array([[[[ 14.15462386, 14.15462386],
|
||||||
|
[ 16.49423111, 16.49423111]],
|
||||||
|
[[ 11.2461395 , 11.2461395 ],
|
||||||
|
[ 13.10500554, 13.10500554]]],
|
||||||
|
[[[ 19.5591166 , 19.5591166 ],
|
||||||
|
[ 22.79202844, 22.79202844]],
|
||||||
|
[[ 15.54012004, 15.54012004],
|
||||||
|
[ 18.10873492, 18.10873492]]]]))
|
||||||
|
"""
|
||||||
|
observed = np.asarray(observed)
|
||||||
|
if np.any(observed < 0):
|
||||||
|
raise ValueError("All values in `observed` must be nonnegative.")
|
||||||
|
if observed.size == 0:
|
||||||
|
raise ValueError("No data; `observed` has size 0.")
|
||||||
|
|
||||||
|
expected = expected_freq(observed)
|
||||||
|
if np.any(expected == 0):
|
||||||
|
# Include one of the positions where expected is zero in
|
||||||
|
# the exception message.
|
||||||
|
zeropos = list(np.where(expected == 0)[0])
|
||||||
|
raise ValueError("The internally computed table of expected "
|
||||||
|
"frequencies has a zero element at %s." % zeropos)
|
||||||
|
|
||||||
|
# The degrees of freedom
|
||||||
|
dof = expected.size - sum(expected.shape) + expected.ndim - 1
|
||||||
|
|
||||||
|
if dof == 0:
|
||||||
|
# Degenerate case; this occurs when `observed` is 1D (or, more
|
||||||
|
# generally, when it has only one nontrivial dimension). In this
|
||||||
|
# case, we also have observed == expected, so chi2 is 0.
|
||||||
|
chi2 = 0.0
|
||||||
|
p = 1.0
|
||||||
|
else:
|
||||||
|
if dof == 1 and correction:
|
||||||
|
# Adjust `observed` according to Yates' correction for continuity.
|
||||||
|
observed = observed + 0.5 * np.sign(expected - observed)
|
||||||
|
|
||||||
|
chi2, p = power_divergence(observed, expected,
|
||||||
|
ddof=observed.size - 1 - dof, axis=None,
|
||||||
|
lambda_=lambda_)
|
||||||
|
|
||||||
|
return chi2, p, dof, expected
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,513 @@
|
|||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Define classes for (uni/multi)-variate kernel density estimation.
|
||||||
|
#
|
||||||
|
# Currently, only Gaussian kernels are implemented.
|
||||||
|
#
|
||||||
|
# Written by: Robert Kern
|
||||||
|
#
|
||||||
|
# Date: 2004-08-09
|
||||||
|
#
|
||||||
|
# Modified: 2005-02-10 by Robert Kern.
|
||||||
|
# Contributed to Scipy
|
||||||
|
# 2005-10-07 by Robert Kern.
|
||||||
|
# Some fixes to match the new scipy_core
|
||||||
|
#
|
||||||
|
# Copyright 2004-2005 by Enthought, Inc.
|
||||||
|
#
|
||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
# Standard library imports.
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
# Scipy imports.
|
||||||
|
from scipy.lib.six import callable, string_types
|
||||||
|
from scipy import linalg, special
|
||||||
|
|
||||||
|
from numpy import atleast_2d, reshape, zeros, newaxis, dot, exp, pi, sqrt, \
|
||||||
|
ravel, power, atleast_1d, squeeze, sum, transpose
|
||||||
|
import numpy as np
|
||||||
|
from numpy.random import randint, multivariate_normal
|
||||||
|
|
||||||
|
# Local imports.
|
||||||
|
from . import mvn
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ['gaussian_kde']
|
||||||
|
|
||||||
|
|
||||||
|
class gaussian_kde(object):
|
||||||
|
"""Representation of a kernel-density estimate using Gaussian kernels.
|
||||||
|
|
||||||
|
Kernel density estimation is a way to estimate the probability density
|
||||||
|
function (PDF) of a random variable in a non-parametric way.
|
||||||
|
`gaussian_kde` works for both uni-variate and multi-variate data. It
|
||||||
|
includes automatic bandwidth determination. The estimation works best for
|
||||||
|
a unimodal distribution; bimodal or multi-modal distributions tend to be
|
||||||
|
oversmoothed.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
dataset : array_like
|
||||||
|
Datapoints to estimate from. In case of univariate data this is a 1-D
|
||||||
|
array, otherwise a 2-D array with shape (# of dims, # of data).
|
||||||
|
bw_method : str, scalar or callable, optional
|
||||||
|
The method used to calculate the estimator bandwidth. This can be
|
||||||
|
'scott', 'silverman', a scalar constant or a callable. If a scalar,
|
||||||
|
this will be used directly as `kde.factor`. If a callable, it should
|
||||||
|
take a `gaussian_kde` instance as only parameter and return a scalar.
|
||||||
|
If None (default), 'scott' is used. See Notes for more details.
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
dataset : ndarray
|
||||||
|
The dataset with which `gaussian_kde` was initialized.
|
||||||
|
d : int
|
||||||
|
Number of dimensions.
|
||||||
|
n : int
|
||||||
|
Number of datapoints.
|
||||||
|
factor : float
|
||||||
|
The bandwidth factor, obtained from `kde.covariance_factor`, with which
|
||||||
|
the covariance matrix is multiplied.
|
||||||
|
covariance : ndarray
|
||||||
|
The covariance matrix of `dataset`, scaled by the calculated bandwidth
|
||||||
|
(`kde.factor`).
|
||||||
|
inv_cov : ndarray
|
||||||
|
The inverse of `covariance`.
|
||||||
|
|
||||||
|
Methods
|
||||||
|
-------
|
||||||
|
kde.evaluate(points) : ndarray
|
||||||
|
Evaluate the estimated pdf on a provided set of points.
|
||||||
|
kde(points) : ndarray
|
||||||
|
Same as kde.evaluate(points)
|
||||||
|
kde.integrate_gaussian(mean, cov) : float
|
||||||
|
Multiply pdf with a specified Gaussian and integrate over the whole
|
||||||
|
domain.
|
||||||
|
kde.integrate_box_1d(low, high) : float
|
||||||
|
Integrate pdf (1D only) between two bounds.
|
||||||
|
kde.integrate_box(low_bounds, high_bounds) : float
|
||||||
|
Integrate pdf over a rectangular space between low_bounds and
|
||||||
|
high_bounds.
|
||||||
|
kde.integrate_kde(other_kde) : float
|
||||||
|
Integrate two kernel density estimates multiplied together.
|
||||||
|
kde.resample(size=None) : ndarray
|
||||||
|
Randomly sample a dataset from the estimated pdf.
|
||||||
|
kde.set_bandwidth(bw_method='scott') : None
|
||||||
|
Computes the bandwidth, i.e. the coefficient that multiplies the data
|
||||||
|
covariance matrix to obtain the kernel covariance matrix.
|
||||||
|
.. versionadded:: 0.11.0
|
||||||
|
kde.covariance_factor : float
|
||||||
|
Computes the coefficient (`kde.factor`) that multiplies the data
|
||||||
|
covariance matrix to obtain the kernel covariance matrix.
|
||||||
|
The default is `scotts_factor`. A subclass can overwrite this method
|
||||||
|
to provide a different method, or set it through a call to
|
||||||
|
`kde.set_bandwidth`.
|
||||||
|
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
Bandwidth selection strongly influences the estimate obtained from the KDE
|
||||||
|
(much more so than the actual shape of the kernel). Bandwidth selection
|
||||||
|
can be done by a "rule of thumb", by cross-validation, by "plug-in
|
||||||
|
methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
|
||||||
|
uses a rule of thumb, the default is Scott's Rule.
|
||||||
|
|
||||||
|
Scott's Rule [1]_, implemented as `scotts_factor`, is::
|
||||||
|
|
||||||
|
n**(-1./(d+4)),
|
||||||
|
|
||||||
|
with ``n`` the number of data points and ``d`` the number of dimensions.
|
||||||
|
Silverman's Rule [2]_, implemented as `silverman_factor`, is::
|
||||||
|
|
||||||
|
n * (d + 2) / 4.)**(-1. / (d + 4)).
|
||||||
|
|
||||||
|
Good general descriptions of kernel density estimation can be found in [1]_
|
||||||
|
and [2]_, the mathematics for this multi-dimensional implementation can be
|
||||||
|
found in [1]_.
|
||||||
|
|
||||||
|
References
|
||||||
|
----------
|
||||||
|
.. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
|
||||||
|
Visualization", John Wiley & Sons, New York, Chicester, 1992.
|
||||||
|
.. [2] B.W. Silverman, "Density Estimation for Statistics and Data
|
||||||
|
Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
|
||||||
|
Chapman and Hall, London, 1986.
|
||||||
|
.. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
|
||||||
|
Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
|
||||||
|
.. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
|
||||||
|
conditional density estimation", Computational Statistics & Data
|
||||||
|
Analysis, Vol. 36, pp. 279-298, 2001.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
Generate some random two-dimensional data:
|
||||||
|
|
||||||
|
>>> from scipy import stats
|
||||||
|
>>> def measure(n):
|
||||||
|
>>> "Measurement model, return two coupled measurements."
|
||||||
|
>>> m1 = np.random.normal(size=n)
|
||||||
|
>>> m2 = np.random.normal(scale=0.5, size=n)
|
||||||
|
>>> return m1+m2, m1-m2
|
||||||
|
|
||||||
|
>>> m1, m2 = measure(2000)
|
||||||
|
>>> xmin = m1.min()
|
||||||
|
>>> xmax = m1.max()
|
||||||
|
>>> ymin = m2.min()
|
||||||
|
>>> ymax = m2.max()
|
||||||
|
|
||||||
|
Perform a kernel density estimate on the data:
|
||||||
|
|
||||||
|
>>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
|
||||||
|
>>> positions = np.vstack([X.ravel(), Y.ravel()])
|
||||||
|
>>> values = np.vstack([m1, m2])
|
||||||
|
>>> kernel = stats.gaussian_kde(values)
|
||||||
|
>>> Z = np.reshape(kernel(positions).T, X.shape)
|
||||||
|
|
||||||
|
Plot the results:
|
||||||
|
|
||||||
|
>>> import matplotlib.pyplot as plt
|
||||||
|
>>> fig = plt.figure()
|
||||||
|
>>> ax = fig.add_subplot(111)
|
||||||
|
>>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
|
||||||
|
... extent=[xmin, xmax, ymin, ymax])
|
||||||
|
>>> ax.plot(m1, m2, 'k.', markersize=2)
|
||||||
|
>>> ax.set_xlim([xmin, xmax])
|
||||||
|
>>> ax.set_ylim([ymin, ymax])
|
||||||
|
>>> plt.show()
|
||||||
|
|
||||||
|
"""
|
||||||
|
def __init__(self, dataset, bw_method=None):
|
||||||
|
self.dataset = atleast_2d(dataset)
|
||||||
|
if not self.dataset.size > 1:
|
||||||
|
raise ValueError("`dataset` input should have multiple elements.")
|
||||||
|
|
||||||
|
self.d, self.n = self.dataset.shape
|
||||||
|
self.set_bandwidth(bw_method=bw_method)
|
||||||
|
|
||||||
|
def evaluate(self, points):
|
||||||
|
"""Evaluate the estimated pdf on a set of points.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
points : (# of dimensions, # of points)-array
|
||||||
|
Alternatively, a (# of dimensions,) vector can be passed in and
|
||||||
|
treated as a single point.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
values : (# of points,)-array
|
||||||
|
The values at each point.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError : if the dimensionality of the input points is different than
|
||||||
|
the dimensionality of the KDE.
|
||||||
|
|
||||||
|
"""
|
||||||
|
points = atleast_2d(points)
|
||||||
|
|
||||||
|
d, m = points.shape
|
||||||
|
if d != self.d:
|
||||||
|
if d == 1 and m == self.d:
|
||||||
|
# points was passed in as a row vector
|
||||||
|
points = reshape(points, (self.d, 1))
|
||||||
|
m = 1
|
||||||
|
else:
|
||||||
|
msg = "points have dimension %s, dataset has dimension %s" % (d,
|
||||||
|
self.d)
|
||||||
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
result = zeros((m,), dtype=np.float)
|
||||||
|
|
||||||
|
if m >= self.n:
|
||||||
|
# there are more points than data, so loop over data
|
||||||
|
for i in range(self.n):
|
||||||
|
diff = self.dataset[:, i, newaxis] - points
|
||||||
|
tdiff = dot(self.inv_cov, diff)
|
||||||
|
energy = sum(diff*tdiff,axis=0) / 2.0
|
||||||
|
result = result + exp(-energy)
|
||||||
|
else:
|
||||||
|
# loop over points
|
||||||
|
for i in range(m):
|
||||||
|
diff = self.dataset - points[:, i, newaxis]
|
||||||
|
tdiff = dot(self.inv_cov, diff)
|
||||||
|
energy = sum(diff * tdiff, axis=0) / 2.0
|
||||||
|
result[i] = sum(exp(-energy), axis=0)
|
||||||
|
|
||||||
|
result = result / self._norm_factor
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
__call__ = evaluate
|
||||||
|
|
||||||
|
def integrate_gaussian(self, mean, cov):
|
||||||
|
"""
|
||||||
|
Multiply estimated density by a multivariate Gaussian and integrate
|
||||||
|
over the whole space.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
mean : aray_like
|
||||||
|
A 1-D array, specifying the mean of the Gaussian.
|
||||||
|
cov : array_like
|
||||||
|
A 2-D array, specifying the covariance matrix of the Gaussian.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
result : scalar
|
||||||
|
The value of the integral.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError :
|
||||||
|
If the mean or covariance of the input Gaussian differs from
|
||||||
|
the KDE's dimensionality.
|
||||||
|
|
||||||
|
"""
|
||||||
|
mean = atleast_1d(squeeze(mean))
|
||||||
|
cov = atleast_2d(cov)
|
||||||
|
|
||||||
|
if mean.shape != (self.d,):
|
||||||
|
raise ValueError("mean does not have dimension %s" % self.d)
|
||||||
|
if cov.shape != (self.d, self.d):
|
||||||
|
raise ValueError("covariance does not have dimension %s" % self.d)
|
||||||
|
|
||||||
|
# make mean a column vector
|
||||||
|
mean = mean[:, newaxis]
|
||||||
|
|
||||||
|
sum_cov = self.covariance + cov
|
||||||
|
|
||||||
|
diff = self.dataset - mean
|
||||||
|
tdiff = dot(linalg.inv(sum_cov), diff)
|
||||||
|
|
||||||
|
energies = sum(diff * tdiff, axis=0) / 2.0
|
||||||
|
result = sum(exp(-energies), axis=0) / sqrt(linalg.det(2 * pi *
|
||||||
|
sum_cov)) / self.n
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
def integrate_box_1d(self, low, high):
|
||||||
|
"""
|
||||||
|
Computes the integral of a 1D pdf between two bounds.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
low : scalar
|
||||||
|
Lower bound of integration.
|
||||||
|
high : scalar
|
||||||
|
Upper bound of integration.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
value : scalar
|
||||||
|
The result of the integral.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
If the KDE is over more than one dimension.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if self.d != 1:
|
||||||
|
raise ValueError("integrate_box_1d() only handles 1D pdfs")
|
||||||
|
|
||||||
|
stdev = ravel(sqrt(self.covariance))[0]
|
||||||
|
|
||||||
|
normalized_low = ravel((low - self.dataset) / stdev)
|
||||||
|
normalized_high = ravel((high - self.dataset) / stdev)
|
||||||
|
|
||||||
|
value = np.mean(special.ndtr(normalized_high) -
|
||||||
|
special.ndtr(normalized_low))
|
||||||
|
return value
|
||||||
|
|
||||||
|
def integrate_box(self, low_bounds, high_bounds, maxpts=None):
|
||||||
|
"""Computes the integral of a pdf over a rectangular interval.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
low_bounds : array_like
|
||||||
|
A 1-D array containing the lower bounds of integration.
|
||||||
|
high_bounds : array_like
|
||||||
|
A 1-D array containing the upper bounds of integration.
|
||||||
|
maxpts : int, optional
|
||||||
|
The maximum number of points to use for integration.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
value : scalar
|
||||||
|
The result of the integral.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if maxpts is not None:
|
||||||
|
extra_kwds = {'maxpts': maxpts}
|
||||||
|
else:
|
||||||
|
extra_kwds = {}
|
||||||
|
|
||||||
|
value, inform = mvn.mvnun(low_bounds, high_bounds, self.dataset,
|
||||||
|
self.covariance, **extra_kwds)
|
||||||
|
if inform:
|
||||||
|
msg = ('An integral in mvn.mvnun requires more points than %s' %
|
||||||
|
(self.d * 1000))
|
||||||
|
warnings.warn(msg)
|
||||||
|
|
||||||
|
return value
|
||||||
|
|
||||||
|
def integrate_kde(self, other):
|
||||||
|
"""
|
||||||
|
Computes the integral of the product of this kernel density estimate
|
||||||
|
with another.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
other : gaussian_kde instance
|
||||||
|
The other kde.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
value : scalar
|
||||||
|
The result of the integral.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
If the KDEs have different dimensionality.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if other.d != self.d:
|
||||||
|
raise ValueError("KDEs are not the same dimensionality")
|
||||||
|
|
||||||
|
# we want to iterate over the smallest number of points
|
||||||
|
if other.n < self.n:
|
||||||
|
small = other
|
||||||
|
large = self
|
||||||
|
else:
|
||||||
|
small = self
|
||||||
|
large = other
|
||||||
|
|
||||||
|
sum_cov = small.covariance + large.covariance
|
||||||
|
result = 0.0
|
||||||
|
for i in range(small.n):
|
||||||
|
mean = small.dataset[:, i, newaxis]
|
||||||
|
diff = large.dataset - mean
|
||||||
|
tdiff = dot(linalg.inv(sum_cov), diff)
|
||||||
|
|
||||||
|
energies = sum(diff * tdiff, axis=0) / 2.0
|
||||||
|
result += sum(exp(-energies), axis=0)
|
||||||
|
|
||||||
|
result /= sqrt(linalg.det(2 * pi * sum_cov)) * large.n * small.n
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
def resample(self, size=None):
|
||||||
|
"""
|
||||||
|
Randomly sample a dataset from the estimated pdf.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
size : int, optional
|
||||||
|
The number of samples to draw. If not provided, then the size is
|
||||||
|
the same as the underlying dataset.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
resample : (self.d, `size`) ndarray
|
||||||
|
The sampled dataset.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if size is None:
|
||||||
|
size = self.n
|
||||||
|
|
||||||
|
norm = transpose(multivariate_normal(zeros((self.d,), float),
|
||||||
|
self.covariance, size=size))
|
||||||
|
indices = randint(0, self.n, size=size)
|
||||||
|
means = self.dataset[:, indices]
|
||||||
|
|
||||||
|
return means + norm
|
||||||
|
|
||||||
|
def scotts_factor(self):
|
||||||
|
return power(self.n, -1./(self.d+4))
|
||||||
|
|
||||||
|
def silverman_factor(self):
|
||||||
|
return power(self.n*(self.d+2.0)/4.0, -1./(self.d+4))
|
||||||
|
|
||||||
|
# Default method to calculate bandwidth, can be overwritten by subclass
|
||||||
|
covariance_factor = scotts_factor
|
||||||
|
|
||||||
|
def set_bandwidth(self, bw_method=None):
|
||||||
|
"""Compute the estimator bandwidth with given method.
|
||||||
|
|
||||||
|
The new bandwidth calculated after a call to `set_bandwidth` is used
|
||||||
|
for subsequent evaluations of the estimated density.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
bw_method : str, scalar or callable, optional
|
||||||
|
The method used to calculate the estimator bandwidth. This can be
|
||||||
|
'scott', 'silverman', a scalar constant or a callable. If a
|
||||||
|
scalar, this will be used directly as `kde.factor`. If a callable,
|
||||||
|
it should take a `gaussian_kde` instance as only parameter and
|
||||||
|
return a scalar. If None (default), nothing happens; the current
|
||||||
|
`kde.covariance_factor` method is kept.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
.. versionadded:: 0.11
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> x1 = np.array([-7, -5, 1, 4, 5.])
|
||||||
|
>>> kde = stats.gaussian_kde(x1)
|
||||||
|
>>> xs = np.linspace(-10, 10, num=50)
|
||||||
|
>>> y1 = kde(xs)
|
||||||
|
>>> kde.set_bandwidth(bw_method='silverman')
|
||||||
|
>>> y2 = kde(xs)
|
||||||
|
>>> kde.set_bandwidth(bw_method=kde.factor / 3.)
|
||||||
|
>>> y3 = kde(xs)
|
||||||
|
|
||||||
|
>>> fig = plt.figure()
|
||||||
|
>>> ax = fig.add_subplot(111)
|
||||||
|
>>> ax.plot(x1, np.ones(x1.shape) / (4. * x1.size), 'bo',
|
||||||
|
... label='Data points (rescaled)')
|
||||||
|
>>> ax.plot(xs, y1, label='Scott (default)')
|
||||||
|
>>> ax.plot(xs, y2, label='Silverman')
|
||||||
|
>>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
|
||||||
|
>>> ax.legend()
|
||||||
|
>>> plt.show()
|
||||||
|
|
||||||
|
"""
|
||||||
|
if bw_method is None:
|
||||||
|
pass
|
||||||
|
elif bw_method == 'scott':
|
||||||
|
self.covariance_factor = self.scotts_factor
|
||||||
|
elif bw_method == 'silverman':
|
||||||
|
self.covariance_factor = self.silverman_factor
|
||||||
|
elif np.isscalar(bw_method) and not isinstance(bw_method, string_types):
|
||||||
|
self._bw_method = 'use constant'
|
||||||
|
self.covariance_factor = lambda: bw_method
|
||||||
|
elif callable(bw_method):
|
||||||
|
self._bw_method = bw_method
|
||||||
|
self.covariance_factor = lambda: self._bw_method(self)
|
||||||
|
else:
|
||||||
|
msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
|
||||||
|
"or a callable."
|
||||||
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
self._compute_covariance()
|
||||||
|
|
||||||
|
def _compute_covariance(self):
|
||||||
|
"""Computes the covariance matrix for each Gaussian kernel using
|
||||||
|
covariance_factor().
|
||||||
|
"""
|
||||||
|
self.factor = self.covariance_factor()
|
||||||
|
# Cache covariance and inverse covariance of the data
|
||||||
|
if not hasattr(self, '_data_inv_cov'):
|
||||||
|
self._data_covariance = atleast_2d(np.cov(self.dataset, rowvar=1,
|
||||||
|
bias=False))
|
||||||
|
self._data_inv_cov = linalg.inv(self._data_covariance)
|
||||||
|
|
||||||
|
self.covariance = self._data_covariance * self.factor**2
|
||||||
|
self.inv_cov = self._data_inv_cov / self.factor**2
|
||||||
|
self._norm_factor = sqrt(linalg.det(2*pi*self.covariance)) * self.n
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,82 @@
|
|||||||
|
"""
|
||||||
|
===================================================================
|
||||||
|
Statistical functions for masked arrays (:mod:`scipy.stats.mstats`)
|
||||||
|
===================================================================
|
||||||
|
|
||||||
|
.. currentmodule:: scipy.stats.mstats
|
||||||
|
|
||||||
|
This module contains a large number of statistical functions that can
|
||||||
|
be used with masked arrays.
|
||||||
|
|
||||||
|
Most of these functions are similar to those in scipy.stats but might
|
||||||
|
have small differences in the API or in the algorithm used. Since this
|
||||||
|
is a relatively new package, some API changes are still possible.
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: generated/
|
||||||
|
|
||||||
|
argstoarray
|
||||||
|
betai
|
||||||
|
chisquare
|
||||||
|
count_tied_groups
|
||||||
|
describe
|
||||||
|
f_oneway
|
||||||
|
f_value_wilks_lambda
|
||||||
|
find_repeats
|
||||||
|
friedmanchisquare
|
||||||
|
gmean
|
||||||
|
hmean
|
||||||
|
kendalltau
|
||||||
|
kendalltau_seasonal
|
||||||
|
kruskalwallis
|
||||||
|
kruskalwallis
|
||||||
|
ks_twosamp
|
||||||
|
ks_twosamp
|
||||||
|
kurtosis
|
||||||
|
kurtosistest
|
||||||
|
linregress
|
||||||
|
mannwhitneyu
|
||||||
|
plotting_positions
|
||||||
|
mode
|
||||||
|
moment
|
||||||
|
mquantiles
|
||||||
|
msign
|
||||||
|
normaltest
|
||||||
|
obrientransform
|
||||||
|
pearsonr
|
||||||
|
plotting_positions
|
||||||
|
pointbiserialr
|
||||||
|
rankdata
|
||||||
|
scoreatpercentile
|
||||||
|
sem
|
||||||
|
signaltonoise
|
||||||
|
skew
|
||||||
|
skewtest
|
||||||
|
spearmanr
|
||||||
|
theilslopes
|
||||||
|
threshold
|
||||||
|
tmax
|
||||||
|
tmean
|
||||||
|
tmin
|
||||||
|
trim
|
||||||
|
trima
|
||||||
|
trimboth
|
||||||
|
trimmed_stde
|
||||||
|
trimr
|
||||||
|
trimtail
|
||||||
|
tsem
|
||||||
|
ttest_onesamp
|
||||||
|
ttest_ind
|
||||||
|
ttest_onesamp
|
||||||
|
ttest_rel
|
||||||
|
tvar
|
||||||
|
variation
|
||||||
|
winsorize
|
||||||
|
zmap
|
||||||
|
zscore
|
||||||
|
|
||||||
|
"""
|
||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
from .mstats_basic import *
|
||||||
|
from .mstats_extras import *
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,466 @@
|
|||||||
|
"""
|
||||||
|
Additional statistics functions, with support to MA.
|
||||||
|
|
||||||
|
:author: Pierre GF Gerard-Marchant
|
||||||
|
:contact: pierregm_at_uga_edu
|
||||||
|
:date: $Date: 2007-10-29 17:18:13 +0200 (Mon, 29 Oct 2007) $
|
||||||
|
:version: $Id: morestats.py 3473 2007-10-29 15:18:13Z jarrod.millman $
|
||||||
|
"""
|
||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
__author__ = "Pierre GF Gerard-Marchant"
|
||||||
|
__docformat__ = "restructuredtext en"
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ['compare_medians_ms',
|
||||||
|
'hdquantiles', 'hdmedian', 'hdquantiles_sd',
|
||||||
|
'idealfourths',
|
||||||
|
'median_cihs','mjci','mquantiles_cimj',
|
||||||
|
'rsh',
|
||||||
|
'trimmed_mean_ci',]
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from numpy import float_, int_, ndarray
|
||||||
|
|
||||||
|
import numpy.ma as ma
|
||||||
|
from numpy.ma import MaskedArray
|
||||||
|
|
||||||
|
from . import mstats_basic as mstats
|
||||||
|
|
||||||
|
from scipy.stats.distributions import norm, beta, t, binom
|
||||||
|
|
||||||
|
|
||||||
|
#####--------------------------------------------------------------------------
|
||||||
|
#---- --- Quantiles ---
|
||||||
|
#####--------------------------------------------------------------------------
|
||||||
|
def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
|
||||||
|
"""
|
||||||
|
Computes quantile estimates with the Harrell-Davis method.
|
||||||
|
|
||||||
|
The quantile estimates are calculated as a weighted linear combination
|
||||||
|
of order statistics.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
data : array_like
|
||||||
|
Data array.
|
||||||
|
prob : sequence
|
||||||
|
Sequence of quantiles to compute.
|
||||||
|
axis : int
|
||||||
|
Axis along which to compute the quantiles. If None, use a flattened
|
||||||
|
array.
|
||||||
|
var : boolean
|
||||||
|
Whether to return the variance of the estimate.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
hdquantiles : MaskedArray
|
||||||
|
A (p,) array of quantiles (if `var` is False), or a (2,p) array of
|
||||||
|
quantiles and variances (if `var` is True), where ``p`` is the
|
||||||
|
number of quantiles.
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _hd_1D(data,prob,var):
|
||||||
|
"Computes the HD quantiles for a 1D array. Returns nan for invalid data."
|
||||||
|
xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
|
||||||
|
# Don't use length here, in case we have a numpy scalar
|
||||||
|
n = xsorted.size
|
||||||
|
#.........
|
||||||
|
hd = np.empty((2,len(prob)), float_)
|
||||||
|
if n < 2:
|
||||||
|
hd.flat = np.nan
|
||||||
|
if var:
|
||||||
|
return hd
|
||||||
|
return hd[0]
|
||||||
|
#.........
|
||||||
|
v = np.arange(n+1) / float(n)
|
||||||
|
betacdf = beta.cdf
|
||||||
|
for (i,p) in enumerate(prob):
|
||||||
|
_w = betacdf(v, (n+1)*p, (n+1)*(1-p))
|
||||||
|
w = _w[1:] - _w[:-1]
|
||||||
|
hd_mean = np.dot(w, xsorted)
|
||||||
|
hd[0,i] = hd_mean
|
||||||
|
#
|
||||||
|
hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
|
||||||
|
#
|
||||||
|
hd[0, prob == 0] = xsorted[0]
|
||||||
|
hd[0, prob == 1] = xsorted[-1]
|
||||||
|
if var:
|
||||||
|
hd[1, prob == 0] = hd[1, prob == 1] = np.nan
|
||||||
|
return hd
|
||||||
|
return hd[0]
|
||||||
|
# Initialization & checks ---------
|
||||||
|
data = ma.array(data, copy=False, dtype=float_)
|
||||||
|
p = np.array(prob, copy=False, ndmin=1)
|
||||||
|
# Computes quantiles along axis (or globally)
|
||||||
|
if (axis is None) or (data.ndim == 1):
|
||||||
|
result = _hd_1D(data, p, var)
|
||||||
|
else:
|
||||||
|
if data.ndim > 2:
|
||||||
|
raise ValueError("Array 'data' must be at most two dimensional, but got data.ndim = %d" % data.ndim)
|
||||||
|
result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
|
||||||
|
#
|
||||||
|
return ma.fix_invalid(result, copy=False)
|
||||||
|
|
||||||
|
#..............................................................................
|
||||||
|
|
||||||
|
|
||||||
|
def hdmedian(data, axis=-1, var=False):
|
||||||
|
"""
|
||||||
|
Returns the Harrell-Davis estimate of the median along the given axis.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
data : ndarray
|
||||||
|
Data array.
|
||||||
|
axis : int
|
||||||
|
Axis along which to compute the quantiles. If None, use a flattened
|
||||||
|
array.
|
||||||
|
var : boolean
|
||||||
|
Whether to return the variance of the estimate.
|
||||||
|
|
||||||
|
"""
|
||||||
|
result = hdquantiles(data,[0.5], axis=axis, var=var)
|
||||||
|
return result.squeeze()
|
||||||
|
|
||||||
|
|
||||||
|
#..............................................................................
|
||||||
|
def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
|
||||||
|
"""
|
||||||
|
The standard error of the Harrell-Davis quantile estimates by jackknife.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
data : array_like
|
||||||
|
Data array.
|
||||||
|
prob : sequence
|
||||||
|
Sequence of quantiles to compute.
|
||||||
|
axis : int
|
||||||
|
Axis along which to compute the quantiles. If None, use a flattened
|
||||||
|
array.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
hdquantiles_sd : MaskedArray
|
||||||
|
Standard error of the Harrell-Davis quantile estimates.
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _hdsd_1D(data,prob):
|
||||||
|
"Computes the std error for 1D arrays."
|
||||||
|
xsorted = np.sort(data.compressed())
|
||||||
|
n = len(xsorted)
|
||||||
|
#.........
|
||||||
|
hdsd = np.empty(len(prob), float_)
|
||||||
|
if n < 2:
|
||||||
|
hdsd.flat = np.nan
|
||||||
|
#.........
|
||||||
|
vv = np.arange(n) / float(n-1)
|
||||||
|
betacdf = beta.cdf
|
||||||
|
#
|
||||||
|
for (i,p) in enumerate(prob):
|
||||||
|
_w = betacdf(vv, (n+1)*p, (n+1)*(1-p))
|
||||||
|
w = _w[1:] - _w[:-1]
|
||||||
|
mx_ = np.fromiter([np.dot(w,xsorted[np.r_[list(range(0,k)),
|
||||||
|
list(range(k+1,n))].astype(int_)])
|
||||||
|
for k in range(n)], dtype=float_)
|
||||||
|
mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1)
|
||||||
|
hdsd[i] = float(n-1) * np.sqrt(np.diag(mx_var).diagonal() / float(n))
|
||||||
|
return hdsd
|
||||||
|
# Initialization & checks ---------
|
||||||
|
data = ma.array(data, copy=False, dtype=float_)
|
||||||
|
p = np.array(prob, copy=False, ndmin=1)
|
||||||
|
# Computes quantiles along axis (or globally)
|
||||||
|
if (axis is None):
|
||||||
|
result = _hdsd_1D(data, p)
|
||||||
|
else:
|
||||||
|
if data.ndim > 2:
|
||||||
|
raise ValueError("Array 'data' must be at most two dimensional, but got data.ndim = %d" % data.ndim)
|
||||||
|
result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
|
||||||
|
#
|
||||||
|
return ma.fix_invalid(result, copy=False).ravel()
|
||||||
|
|
||||||
|
|
||||||
|
#####--------------------------------------------------------------------------
|
||||||
|
#---- --- Confidence intervals ---
|
||||||
|
#####--------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
|
||||||
|
alpha=0.05, axis=None):
|
||||||
|
"""
|
||||||
|
Selected confidence interval of the trimmed mean along the given axis.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
data : array_like
|
||||||
|
Input data.
|
||||||
|
limits : {None, tuple}, optional
|
||||||
|
None or a two item tuple.
|
||||||
|
Tuple of the percentages to cut on each side of the array, with respect
|
||||||
|
to the number of unmasked data, as floats between 0. and 1. If ``n``
|
||||||
|
is the number of unmasked data before trimming, then
|
||||||
|
(``n`` * `limits[0]`)th smallest data and (``n`` * `limits[1]`)th
|
||||||
|
largest data are masked. The total number of unmasked data after
|
||||||
|
trimming is ``n`` * (1. - sum(`limits`)).
|
||||||
|
The value of one limit can be set to None to indicate an open interval.
|
||||||
|
|
||||||
|
Defaults to (0.2, 0.2).
|
||||||
|
inclusive : (2,) tuple of boolean, optional
|
||||||
|
If relative==False, tuple indicating whether values exactly equal to
|
||||||
|
the absolute limits are allowed.
|
||||||
|
If relative==True, tuple indicating whether the number of data being
|
||||||
|
masked on each side should be rounded (True) or truncated (False).
|
||||||
|
|
||||||
|
Defaults to (True, True).
|
||||||
|
alpha : float, optional
|
||||||
|
Confidence level of the intervals.
|
||||||
|
|
||||||
|
Defaults to 0.05.
|
||||||
|
axis : int, optional
|
||||||
|
Axis along which to cut. If None, uses a flattened version of `data`.
|
||||||
|
|
||||||
|
Defaults to None.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
trimmed_mean_ci : (2,) ndarray
|
||||||
|
The lower and upper confidence intervals of the trimmed data.
|
||||||
|
|
||||||
|
"""
|
||||||
|
data = ma.array(data, copy=False)
|
||||||
|
trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
|
||||||
|
tmean = trimmed.mean(axis)
|
||||||
|
tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
|
||||||
|
df = trimmed.count(axis) - 1
|
||||||
|
tppf = t.ppf(1-alpha/2.,df)
|
||||||
|
return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
|
||||||
|
|
||||||
|
#..............................................................................
|
||||||
|
|
||||||
|
|
||||||
|
def mjci(data, prob=[0.25,0.5,0.75], axis=None):
|
||||||
|
"""
|
||||||
|
Returns the Maritz-Jarrett estimators of the standard error of selected
|
||||||
|
experimental quantiles of the data.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
data: ndarray
|
||||||
|
Data array.
|
||||||
|
prob: sequence
|
||||||
|
Sequence of quantiles to compute.
|
||||||
|
axis : int
|
||||||
|
Axis along which to compute the quantiles. If None, use a flattened
|
||||||
|
array.
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _mjci_1D(data, p):
|
||||||
|
data = np.sort(data.compressed())
|
||||||
|
n = data.size
|
||||||
|
prob = (np.array(p) * n + 0.5).astype(int_)
|
||||||
|
betacdf = beta.cdf
|
||||||
|
#
|
||||||
|
mj = np.empty(len(prob), float_)
|
||||||
|
x = np.arange(1,n+1, dtype=float_) / n
|
||||||
|
y = x - 1./n
|
||||||
|
for (i,m) in enumerate(prob):
|
||||||
|
(m1,m2) = (m-1, n-m)
|
||||||
|
W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
|
||||||
|
C1 = np.dot(W,data)
|
||||||
|
C2 = np.dot(W,data**2)
|
||||||
|
mj[i] = np.sqrt(C2 - C1**2)
|
||||||
|
return mj
|
||||||
|
#
|
||||||
|
data = ma.array(data, copy=False)
|
||||||
|
if data.ndim > 2:
|
||||||
|
raise ValueError("Array 'data' must be at most two dimensional, but got data.ndim = %d" % data.ndim)
|
||||||
|
p = np.array(prob, copy=False, ndmin=1)
|
||||||
|
# Computes quantiles along axis (or globally)
|
||||||
|
if (axis is None):
|
||||||
|
return _mjci_1D(data, p)
|
||||||
|
else:
|
||||||
|
return ma.apply_along_axis(_mjci_1D, axis, data, p)
|
||||||
|
|
||||||
|
#..............................................................................
|
||||||
|
|
||||||
|
|
||||||
|
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
|
||||||
|
"""
|
||||||
|
Computes the alpha confidence interval for the selected quantiles of the
|
||||||
|
data, with Maritz-Jarrett estimators.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
data : ndarray
|
||||||
|
Data array.
|
||||||
|
prob : sequence
|
||||||
|
Sequence of quantiles to compute.
|
||||||
|
alpha : float
|
||||||
|
Confidence level of the intervals.
|
||||||
|
axis : integer
|
||||||
|
Axis along which to compute the quantiles.
|
||||||
|
If None, use a flattened array.
|
||||||
|
|
||||||
|
"""
|
||||||
|
alpha = min(alpha, 1-alpha)
|
||||||
|
z = norm.ppf(1-alpha/2.)
|
||||||
|
xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
|
||||||
|
smj = mjci(data, prob, axis=axis)
|
||||||
|
return (xq - z * smj, xq + z * smj)
|
||||||
|
|
||||||
|
|
||||||
|
#.............................................................................
|
||||||
|
def median_cihs(data, alpha=0.05, axis=None):
|
||||||
|
"""
|
||||||
|
Computes the alpha-level confidence interval for the median of the data.
|
||||||
|
|
||||||
|
Uses the Hettmasperger-Sheather method.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
data : array_like
|
||||||
|
Input data. Masked values are discarded. The input should be 1D only,
|
||||||
|
or `axis` should be set to None.
|
||||||
|
alpha : float
|
||||||
|
Confidence level of the intervals.
|
||||||
|
axis : integer
|
||||||
|
Axis along which to compute the quantiles. If None, use a flattened
|
||||||
|
array.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
median_cihs :
|
||||||
|
Alpha level confidence interval.
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _cihs_1D(data, alpha):
|
||||||
|
data = np.sort(data.compressed())
|
||||||
|
n = len(data)
|
||||||
|
alpha = min(alpha, 1-alpha)
|
||||||
|
k = int(binom._ppf(alpha/2., n, 0.5))
|
||||||
|
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
|
||||||
|
if gk < 1-alpha:
|
||||||
|
k -= 1
|
||||||
|
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
|
||||||
|
gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
|
||||||
|
I = (gk - 1 + alpha)/(gk - gkk)
|
||||||
|
lambd = (n-k) * I / float(k + (n-2*k)*I)
|
||||||
|
lims = (lambd*data[k] + (1-lambd)*data[k-1],
|
||||||
|
lambd*data[n-k-1] + (1-lambd)*data[n-k])
|
||||||
|
return lims
|
||||||
|
data = ma.rray(data, copy=False)
|
||||||
|
# Computes quantiles along axis (or globally)
|
||||||
|
if (axis is None):
|
||||||
|
result = _cihs_1D(data.compressed(), alpha)
|
||||||
|
else:
|
||||||
|
if data.ndim > 2:
|
||||||
|
raise ValueError("Array 'data' must be at most two dimensional, but got data.ndim = %d" % data.ndim)
|
||||||
|
result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
|
||||||
|
#
|
||||||
|
return result
|
||||||
|
|
||||||
|
#..............................................................................
|
||||||
|
|
||||||
|
|
||||||
|
def compare_medians_ms(group_1, group_2, axis=None):
|
||||||
|
"""
|
||||||
|
Compares the medians from two independent groups along the given axis.
|
||||||
|
|
||||||
|
The comparison is performed using the McKean-Schrader estimate of the
|
||||||
|
standard error of the medians.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
group_1 : array_like
|
||||||
|
First dataset.
|
||||||
|
group_2 : array_like
|
||||||
|
Second dataset.
|
||||||
|
axis : int, optional
|
||||||
|
Axis along which the medians are estimated. If None, the arrays are
|
||||||
|
flattened. If `axis` is not None, then `group_1` and `group_2`
|
||||||
|
should have the same shape.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
compare_medians_ms : {float, ndarray}
|
||||||
|
If `axis` is None, then returns a float, otherwise returns a 1-D
|
||||||
|
ndarray of floats with a length equal to the length of `group_1`
|
||||||
|
along `axis`.
|
||||||
|
|
||||||
|
"""
|
||||||
|
(med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
|
||||||
|
(std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
|
||||||
|
mstats.stde_median(group_2, axis=axis))
|
||||||
|
W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
|
||||||
|
return 1 - norm.cdf(W)
|
||||||
|
|
||||||
|
|
||||||
|
def idealfourths(data, axis=None):
|
||||||
|
"""
|
||||||
|
Returns an estimate of the lower and upper quartiles.
|
||||||
|
|
||||||
|
Uses the ideal fourths algorithm.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
data : array_like
|
||||||
|
Input array.
|
||||||
|
axis : int, optional
|
||||||
|
Axis along which the quartiles are estimated. If None, the arrays are
|
||||||
|
flattened.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
idealfourths : {list of floats, masked array}
|
||||||
|
Returns the two internal values that divide `data` into four parts
|
||||||
|
using the ideal fourths algorithm either along the flattened array
|
||||||
|
(if `axis` is None) or along `axis` of `data`.
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _idf(data):
|
||||||
|
x = data.compressed()
|
||||||
|
n = len(x)
|
||||||
|
if n < 3:
|
||||||
|
return [np.nan,np.nan]
|
||||||
|
(j,h) = divmod(n/4. + 5/12.,1)
|
||||||
|
j = int(j)
|
||||||
|
qlo = (1-h)*x[j-1] + h*x[j]
|
||||||
|
k = n - j
|
||||||
|
qup = (1-h)*x[k] + h*x[k-1]
|
||||||
|
return [qlo, qup]
|
||||||
|
data = ma.sort(data, axis=axis).view(MaskedArray)
|
||||||
|
if (axis is None):
|
||||||
|
return _idf(data)
|
||||||
|
else:
|
||||||
|
return ma.apply_along_axis(_idf, axis, data)
|
||||||
|
|
||||||
|
|
||||||
|
def rsh(data, points=None):
|
||||||
|
"""
|
||||||
|
Evaluates Rosenblatt's shifted histogram estimators for each point
|
||||||
|
on the dataset 'data'.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
data : sequence
|
||||||
|
Input data. Masked values are ignored.
|
||||||
|
points : sequence
|
||||||
|
Sequence of points where to evaluate Rosenblatt shifted histogram.
|
||||||
|
If None, use the data.
|
||||||
|
|
||||||
|
"""
|
||||||
|
data = ma.array(data, copy=False)
|
||||||
|
if points is None:
|
||||||
|
points = data
|
||||||
|
else:
|
||||||
|
points = np.array(points, copy=False, ndmin=1)
|
||||||
|
if data.ndim != 1:
|
||||||
|
raise AttributeError("The input array should be 1D only !")
|
||||||
|
n = data.count()
|
||||||
|
r = idealfourths(data, axis=None)
|
||||||
|
h = 1.2 * (r[-1]-r[0]) / n**(1./5)
|
||||||
|
nhi = (data[:,None] <= points[None,:] + h).sum(0)
|
||||||
|
nlo = (data[:,None] < points[None,:] - h).sum(0)
|
||||||
|
return (nhi-nlo) / (2.*n*h)
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################################
|
@ -0,0 +1,76 @@
|
|||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
from numpy import vectorize, deprecate
|
||||||
|
from numpy.random import random_sample
|
||||||
|
|
||||||
|
__all__ = ['randwppf', 'randwcdf']
|
||||||
|
|
||||||
|
# XXX: Are these needed anymore?
|
||||||
|
|
||||||
|
#####################################
|
||||||
|
# General purpose continuous
|
||||||
|
######################################
|
||||||
|
|
||||||
|
|
||||||
|
@deprecate(message="Deprecated in scipy 0.14.0, use "
|
||||||
|
"distribution-specific rvs() method instead")
|
||||||
|
def randwppf(ppf, args=(), size=None):
|
||||||
|
"""
|
||||||
|
returns an array of randomly distributed integers of a distribution
|
||||||
|
whose percent point function (inverse of the CDF or quantile function)
|
||||||
|
is given.
|
||||||
|
|
||||||
|
args is a tuple of extra arguments to the ppf function (i.e. shape,
|
||||||
|
location, scale), and size is the size of the output. Note the ppf
|
||||||
|
function must accept an array of q values to compute over.
|
||||||
|
|
||||||
|
"""
|
||||||
|
U = random_sample(size=size)
|
||||||
|
return ppf(*(U,)+args)
|
||||||
|
|
||||||
|
|
||||||
|
@deprecate(message="Deprecated in scipy 0.14.0, use "
|
||||||
|
"distribution-specific rvs() method instead")
|
||||||
|
def randwcdf(cdf, mean=1.0, args=(), size=None):
|
||||||
|
"""
|
||||||
|
Returns an array of randomly distributed integers given a CDF.
|
||||||
|
|
||||||
|
Given a cumulative distribution function (CDF) returns an array of
|
||||||
|
randomly distributed integers that would satisfy the CDF.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
cdf : function
|
||||||
|
CDF function that accepts a single value and `args`, and returns
|
||||||
|
an single value.
|
||||||
|
mean : float, optional
|
||||||
|
The mean of the distribution which helps the solver. Defaults
|
||||||
|
to 1.0.
|
||||||
|
args : tuple, optional
|
||||||
|
Extra arguments to the cdf function (i.e. shape, location, scale)
|
||||||
|
size : {int, None}, optional
|
||||||
|
Is the size of the output. If None, only 1 value will be returned.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
randwcdf : ndarray
|
||||||
|
Array of random numbers.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
Can use the ``scipy.stats.distributions.*.cdf`` functions for the
|
||||||
|
`cdf` parameter.
|
||||||
|
|
||||||
|
"""
|
||||||
|
import scipy.optimize as optimize
|
||||||
|
|
||||||
|
def _ppfopt(x, q, *nargs):
|
||||||
|
newargs = (x,)+nargs
|
||||||
|
return cdf(*newargs) - q
|
||||||
|
|
||||||
|
def _ppf(q, *nargs):
|
||||||
|
return optimize.fsolve(_ppfopt, mean, args=(q,)+nargs)
|
||||||
|
|
||||||
|
_vppf = vectorize(_ppf)
|
||||||
|
U = random_sample(size=size)
|
||||||
|
return _vppf(*(U,)+args)
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,154 @@
|
|||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
import inspect
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import numpy.testing as npt
|
||||||
|
|
||||||
|
#from scipy.lib._version import NumpyVersion
|
||||||
|
from scipy import stats
|
||||||
|
|
||||||
|
|
||||||
|
#NUMPY_BELOW_1_7 = NumpyVersion(np.__version__) < '1.7.0'
|
||||||
|
NUMPY_BELOW_1_7 =np.__version__ < '1.7.0'
|
||||||
|
|
||||||
|
|
||||||
|
def check_normalization(distfn, args, distname):
|
||||||
|
norm_moment = distfn.moment(0, *args)
|
||||||
|
npt.assert_allclose(norm_moment, 1.0)
|
||||||
|
|
||||||
|
# this is a temporary plug: either ncf or expect is problematic;
|
||||||
|
# best be marked as a knownfail, but I've no clue how to do it.
|
||||||
|
if distname == "ncf":
|
||||||
|
atol, rtol = 1e-5, 0
|
||||||
|
else:
|
||||||
|
atol, rtol = 1e-7, 1e-7
|
||||||
|
|
||||||
|
normalization_expect = distfn.expect(lambda x: 1, args=args)
|
||||||
|
npt.assert_allclose(normalization_expect, 1.0, atol=atol, rtol=rtol,
|
||||||
|
err_msg=distname, verbose=True)
|
||||||
|
|
||||||
|
normalization_cdf = distfn.cdf(distfn.b, *args)
|
||||||
|
npt.assert_allclose(normalization_cdf, 1.0)
|
||||||
|
|
||||||
|
|
||||||
|
def check_moment(distfn, arg, m, v, msg):
|
||||||
|
m1 = distfn.moment(1, *arg)
|
||||||
|
m2 = distfn.moment(2, *arg)
|
||||||
|
if not np.isinf(m):
|
||||||
|
npt.assert_almost_equal(m1, m, decimal=10, err_msg=msg +
|
||||||
|
' - 1st moment')
|
||||||
|
else: # or np.isnan(m1),
|
||||||
|
npt.assert_(np.isinf(m1),
|
||||||
|
msg + ' - 1st moment -infinite, m1=%s' % str(m1))
|
||||||
|
|
||||||
|
if not np.isinf(v):
|
||||||
|
npt.assert_almost_equal(m2 - m1 * m1, v, decimal=10, err_msg=msg +
|
||||||
|
' - 2ndt moment')
|
||||||
|
else: # or np.isnan(m2),
|
||||||
|
npt.assert_(np.isinf(m2),
|
||||||
|
msg + ' - 2nd moment -infinite, m2=%s' % str(m2))
|
||||||
|
|
||||||
|
|
||||||
|
def check_mean_expect(distfn, arg, m, msg):
|
||||||
|
if np.isfinite(m):
|
||||||
|
m1 = distfn.expect(lambda x: x, arg)
|
||||||
|
npt.assert_almost_equal(m1, m, decimal=5, err_msg=msg +
|
||||||
|
' - 1st moment (expect)')
|
||||||
|
|
||||||
|
|
||||||
|
def check_var_expect(distfn, arg, m, v, msg):
|
||||||
|
if np.isfinite(v):
|
||||||
|
m2 = distfn.expect(lambda x: x*x, arg)
|
||||||
|
npt.assert_almost_equal(m2, v + m*m, decimal=5, err_msg=msg +
|
||||||
|
' - 2st moment (expect)')
|
||||||
|
|
||||||
|
|
||||||
|
def check_skew_expect(distfn, arg, m, v, s, msg):
|
||||||
|
if np.isfinite(s):
|
||||||
|
m3e = distfn.expect(lambda x: np.power(x-m, 3), arg)
|
||||||
|
npt.assert_almost_equal(m3e, s * np.power(v, 1.5),
|
||||||
|
decimal=5, err_msg=msg + ' - skew')
|
||||||
|
else:
|
||||||
|
npt.assert_(np.isnan(s))
|
||||||
|
|
||||||
|
|
||||||
|
def check_kurt_expect(distfn, arg, m, v, k, msg):
|
||||||
|
if np.isfinite(k):
|
||||||
|
m4e = distfn.expect(lambda x: np.power(x-m, 4), arg)
|
||||||
|
npt.assert_allclose(m4e, (k + 3.) * np.power(v, 2), atol=1e-5, rtol=1e-5,
|
||||||
|
err_msg=msg + ' - kurtosis')
|
||||||
|
else:
|
||||||
|
npt.assert_(np.isnan(k))
|
||||||
|
|
||||||
|
|
||||||
|
def check_entropy(distfn, arg, msg):
|
||||||
|
ent = distfn.entropy(*arg)
|
||||||
|
npt.assert_(not np.isnan(ent), msg + 'test Entropy is nan')
|
||||||
|
|
||||||
|
|
||||||
|
def check_private_entropy(distfn, args, superclass):
|
||||||
|
# compare a generic _entropy with the distribution-specific implementation
|
||||||
|
npt.assert_allclose(distfn._entropy(*args),
|
||||||
|
superclass._entropy(distfn, *args))
|
||||||
|
|
||||||
|
|
||||||
|
def check_edge_support(distfn, args):
|
||||||
|
# Make sure the x=self.a and self.b are handled correctly.
|
||||||
|
x = [distfn.a, distfn.b]
|
||||||
|
if isinstance(distfn, stats.rv_continuous):
|
||||||
|
npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0])
|
||||||
|
npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0])
|
||||||
|
|
||||||
|
npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0])
|
||||||
|
npt.assert_equal(distfn.logsf(x, *args), [0.0, -np.inf])
|
||||||
|
|
||||||
|
if isinstance(distfn, stats.rv_discrete):
|
||||||
|
x = [distfn.a - 1, distfn.b]
|
||||||
|
npt.assert_equal(distfn.ppf([0.0, 1.0], *args), x)
|
||||||
|
npt.assert_equal(distfn.isf([0.0, 1.0], *args), x[::-1])
|
||||||
|
|
||||||
|
# out-of-bounds for isf & ppf
|
||||||
|
npt.assert_(np.isnan(distfn.isf([-1, 2], *args)).all())
|
||||||
|
npt.assert_(np.isnan(distfn.ppf([-1, 2], *args)).all())
|
||||||
|
|
||||||
|
|
||||||
|
def check_named_args(distfn, x, shape_args, defaults, meths):
|
||||||
|
## Check calling w/ named arguments.
|
||||||
|
|
||||||
|
# check consistency of shapes, numargs and _parse signature
|
||||||
|
signature = inspect.getargspec(distfn._parse_args)
|
||||||
|
npt.assert_(signature.varargs is None)
|
||||||
|
npt.assert_(signature.keywords is None)
|
||||||
|
npt.assert_(signature.defaults == defaults)
|
||||||
|
|
||||||
|
shape_argnames = signature.args[1:-len(defaults)] # self, a, b, loc=0, scale=1
|
||||||
|
if distfn.shapes:
|
||||||
|
shapes_ = distfn.shapes.replace(',', ' ').split()
|
||||||
|
else:
|
||||||
|
shapes_ = ''
|
||||||
|
npt.assert_(len(shapes_) == distfn.numargs)
|
||||||
|
npt.assert_(len(shapes_) == len(shape_argnames))
|
||||||
|
|
||||||
|
# check calling w/ named arguments
|
||||||
|
shape_args = list(shape_args)
|
||||||
|
|
||||||
|
vals = [meth(x, *shape_args) for meth in meths]
|
||||||
|
npt.assert_(np.all(np.isfinite(vals)))
|
||||||
|
|
||||||
|
names, a, k = shape_argnames[:], shape_args[:], {}
|
||||||
|
while names:
|
||||||
|
k.update({names.pop(): a.pop()})
|
||||||
|
v = [meth(x, *a, **k) for meth in meths]
|
||||||
|
npt.assert_array_equal(vals, v)
|
||||||
|
if not 'n' in k.keys():
|
||||||
|
# `n` is first parameter of moment(), so can't be used as named arg
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("ignore", UserWarning)
|
||||||
|
npt.assert_equal(distfn.moment(1, *a, **k),
|
||||||
|
distfn.moment(1, *shape_args))
|
||||||
|
|
||||||
|
# unknown arguments should not go through:
|
||||||
|
k.update({'kaboom': 42})
|
||||||
|
npt.assert_raises(TypeError, distfn.cdf, x, **k)
|
@ -0,0 +1,238 @@
|
|||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from numpy.testing import assert_array_almost_equal, run_module_suite
|
||||||
|
from scipy.stats import \
|
||||||
|
binned_statistic, binned_statistic_2d, binned_statistic_dd
|
||||||
|
|
||||||
|
|
||||||
|
class TestBinnedStatistic(object):
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setup_class(cls):
|
||||||
|
np.random.seed(9865)
|
||||||
|
cls.x = np.random.random(100)
|
||||||
|
cls.y = np.random.random(100)
|
||||||
|
cls.v = np.random.random(100)
|
||||||
|
cls.X = np.random.random((100, 3))
|
||||||
|
|
||||||
|
def test_1d_count(self):
|
||||||
|
x = self.x
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
count1, edges1, bc = binned_statistic(x, v, 'count', bins=10)
|
||||||
|
count2, edges2 = np.histogram(x, bins=10)
|
||||||
|
|
||||||
|
assert_array_almost_equal(count1, count2)
|
||||||
|
assert_array_almost_equal(edges1, edges2)
|
||||||
|
|
||||||
|
def test_1d_sum(self):
|
||||||
|
x = self.x
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
sum1, edges1, bc = binned_statistic(x, v, 'sum', bins=10)
|
||||||
|
sum2, edges2 = np.histogram(x, bins=10, weights=v)
|
||||||
|
|
||||||
|
assert_array_almost_equal(sum1, sum2)
|
||||||
|
assert_array_almost_equal(edges1, edges2)
|
||||||
|
|
||||||
|
def test_1d_mean(self):
|
||||||
|
x = self.x
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
stat1, edges1, bc = binned_statistic(x, v, 'mean', bins=10)
|
||||||
|
stat2, edges2, bc = binned_statistic(x, v, np.mean, bins=10)
|
||||||
|
|
||||||
|
assert_array_almost_equal(stat1, stat2)
|
||||||
|
assert_array_almost_equal(edges1, edges2)
|
||||||
|
|
||||||
|
def test_1d_std(self):
|
||||||
|
x = self.x
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
stat1, edges1, bc = binned_statistic(x, v, 'std', bins=10)
|
||||||
|
stat2, edges2, bc = binned_statistic(x, v, np.std, bins=10)
|
||||||
|
|
||||||
|
assert_array_almost_equal(stat1, stat2)
|
||||||
|
assert_array_almost_equal(edges1, edges2)
|
||||||
|
|
||||||
|
def test_1d_median(self):
|
||||||
|
x = self.x
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
stat1, edges1, bc = binned_statistic(x, v, 'median', bins=10)
|
||||||
|
stat2, edges2, bc = binned_statistic(x, v, np.median, bins=10)
|
||||||
|
|
||||||
|
assert_array_almost_equal(stat1, stat2)
|
||||||
|
assert_array_almost_equal(edges1, edges2)
|
||||||
|
|
||||||
|
def test_1d_bincode(self):
|
||||||
|
x = self.x[:20]
|
||||||
|
v = self.v[:20]
|
||||||
|
|
||||||
|
count1, edges1, bc = binned_statistic(x, v, 'count', bins=3)
|
||||||
|
bc2 = np.array([3, 2, 1, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 1, 2, 3, 1,
|
||||||
|
1, 2, 1])
|
||||||
|
|
||||||
|
bcount = [(bc == i).sum() for i in np.unique(bc)]
|
||||||
|
|
||||||
|
assert_array_almost_equal(bc, bc2)
|
||||||
|
assert_array_almost_equal(bcount, count1)
|
||||||
|
|
||||||
|
def test_1d_range_keyword(self):
|
||||||
|
# Regression test for gh-3063, range can be (min, max) or [(min, max)]
|
||||||
|
np.random.seed(9865)
|
||||||
|
x = np.arange(30)
|
||||||
|
data = np.random.random(30)
|
||||||
|
|
||||||
|
mean, bins, _ = binned_statistic(x[:15], data[:15])
|
||||||
|
mean_range, bins_range, _ = binned_statistic(x, data, range=[(0, 14)])
|
||||||
|
mean_range2, bins_range2, _ = binned_statistic(x, data, range=(0, 14))
|
||||||
|
|
||||||
|
assert_array_almost_equal(mean, mean_range)
|
||||||
|
assert_array_almost_equal(bins, bins_range)
|
||||||
|
assert_array_almost_equal(mean, mean_range2)
|
||||||
|
assert_array_almost_equal(bins, bins_range2)
|
||||||
|
|
||||||
|
def test_2d_count(self):
|
||||||
|
x = self.x
|
||||||
|
y = self.y
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
count1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'count', bins=5)
|
||||||
|
count2, binx2, biny2 = np.histogram2d(x, y, bins=5)
|
||||||
|
|
||||||
|
assert_array_almost_equal(count1, count2)
|
||||||
|
assert_array_almost_equal(binx1, binx2)
|
||||||
|
assert_array_almost_equal(biny1, biny2)
|
||||||
|
|
||||||
|
def test_2d_sum(self):
|
||||||
|
x = self.x
|
||||||
|
y = self.y
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
sum1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'sum', bins=5)
|
||||||
|
sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v)
|
||||||
|
|
||||||
|
assert_array_almost_equal(sum1, sum2)
|
||||||
|
assert_array_almost_equal(binx1, binx2)
|
||||||
|
assert_array_almost_equal(biny1, biny2)
|
||||||
|
|
||||||
|
def test_2d_mean(self):
|
||||||
|
x = self.x
|
||||||
|
y = self.y
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'mean', bins=5)
|
||||||
|
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
|
||||||
|
|
||||||
|
assert_array_almost_equal(stat1, stat2)
|
||||||
|
assert_array_almost_equal(binx1, binx2)
|
||||||
|
assert_array_almost_equal(biny1, biny2)
|
||||||
|
|
||||||
|
def test_2d_std(self):
|
||||||
|
x = self.x
|
||||||
|
y = self.y
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'std', bins=5)
|
||||||
|
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.std, bins=5)
|
||||||
|
|
||||||
|
assert_array_almost_equal(stat1, stat2)
|
||||||
|
assert_array_almost_equal(binx1, binx2)
|
||||||
|
assert_array_almost_equal(biny1, biny2)
|
||||||
|
|
||||||
|
def test_2d_median(self):
|
||||||
|
x = self.x
|
||||||
|
y = self.y
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'median', bins=5)
|
||||||
|
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.median, bins=5)
|
||||||
|
|
||||||
|
assert_array_almost_equal(stat1, stat2)
|
||||||
|
assert_array_almost_equal(binx1, binx2)
|
||||||
|
assert_array_almost_equal(biny1, biny2)
|
||||||
|
|
||||||
|
def test_2d_bincode(self):
|
||||||
|
x = self.x[:20]
|
||||||
|
y = self.y[:20]
|
||||||
|
v = self.v[:20]
|
||||||
|
|
||||||
|
count1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'count', bins=3)
|
||||||
|
bc2 = np.array([17, 11, 6, 16, 11, 17, 18, 17, 17, 7, 6, 18, 16,
|
||||||
|
6, 11, 16, 6, 6, 11, 8])
|
||||||
|
|
||||||
|
bcount = [(bc == i).sum() for i in np.unique(bc)]
|
||||||
|
|
||||||
|
assert_array_almost_equal(bc, bc2)
|
||||||
|
count1adj = count1[count1.nonzero()]
|
||||||
|
assert_array_almost_equal(bcount, count1adj)
|
||||||
|
|
||||||
|
def test_dd_count(self):
|
||||||
|
X = self.X
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
|
||||||
|
count2, edges2 = np.histogramdd(X, bins=3)
|
||||||
|
|
||||||
|
assert_array_almost_equal(count1, count2)
|
||||||
|
assert_array_almost_equal(edges1, edges2)
|
||||||
|
|
||||||
|
def test_dd_sum(self):
|
||||||
|
X = self.X
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
sum1, edges1, bc = binned_statistic_dd(X, v, 'sum', bins=3)
|
||||||
|
sum2, edges2 = np.histogramdd(X, bins=3, weights=v)
|
||||||
|
|
||||||
|
assert_array_almost_equal(sum1, sum2)
|
||||||
|
assert_array_almost_equal(edges1, edges2)
|
||||||
|
|
||||||
|
def test_dd_mean(self):
|
||||||
|
X = self.X
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
stat1, edges1, bc = binned_statistic_dd(X, v, 'mean', bins=3)
|
||||||
|
stat2, edges2, bc = binned_statistic_dd(X, v, np.mean, bins=3)
|
||||||
|
|
||||||
|
assert_array_almost_equal(stat1, stat2)
|
||||||
|
assert_array_almost_equal(edges1, edges2)
|
||||||
|
|
||||||
|
def test_dd_std(self):
|
||||||
|
X = self.X
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
stat1, edges1, bc = binned_statistic_dd(X, v, 'std', bins=3)
|
||||||
|
stat2, edges2, bc = binned_statistic_dd(X, v, np.std, bins=3)
|
||||||
|
|
||||||
|
assert_array_almost_equal(stat1, stat2)
|
||||||
|
assert_array_almost_equal(edges1, edges2)
|
||||||
|
|
||||||
|
def test_dd_median(self):
|
||||||
|
X = self.X
|
||||||
|
v = self.v
|
||||||
|
|
||||||
|
stat1, edges1, bc = binned_statistic_dd(X, v, 'median', bins=3)
|
||||||
|
stat2, edges2, bc = binned_statistic_dd(X, v, np.median, bins=3)
|
||||||
|
|
||||||
|
assert_array_almost_equal(stat1, stat2)
|
||||||
|
assert_array_almost_equal(edges1, edges2)
|
||||||
|
|
||||||
|
def test_dd_bincode(self):
|
||||||
|
X = self.X[:20]
|
||||||
|
v = self.v[:20]
|
||||||
|
|
||||||
|
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
|
||||||
|
bc2 = np.array([63, 33, 86, 83, 88, 67, 57, 33, 42, 41, 82, 83, 92,
|
||||||
|
32, 36, 91, 43, 87, 81, 81])
|
||||||
|
|
||||||
|
bcount = [(bc == i).sum() for i in np.unique(bc)]
|
||||||
|
|
||||||
|
assert_array_almost_equal(bc, bc2)
|
||||||
|
count1adj = count1[count1.nonzero()]
|
||||||
|
assert_array_almost_equal(bcount, count1adj)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_module_suite()
|
@ -0,0 +1,202 @@
|
|||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from numpy.testing import (run_module_suite, assert_equal, assert_array_equal,
|
||||||
|
assert_array_almost_equal, assert_approx_equal, assert_raises,
|
||||||
|
assert_allclose)
|
||||||
|
from scipy.special import xlogy
|
||||||
|
from scipy.stats.contingency import margins, expected_freq, chi2_contingency
|
||||||
|
|
||||||
|
|
||||||
|
def test_margins():
|
||||||
|
a = np.array([1])
|
||||||
|
m = margins(a)
|
||||||
|
assert_equal(len(m), 1)
|
||||||
|
m0 = m[0]
|
||||||
|
assert_array_equal(m0, np.array([1]))
|
||||||
|
|
||||||
|
a = np.array([[1]])
|
||||||
|
m0, m1 = margins(a)
|
||||||
|
expected0 = np.array([[1]])
|
||||||
|
expected1 = np.array([[1]])
|
||||||
|
assert_array_equal(m0, expected0)
|
||||||
|
assert_array_equal(m1, expected1)
|
||||||
|
|
||||||
|
a = np.arange(12).reshape(2, 6)
|
||||||
|
m0, m1 = margins(a)
|
||||||
|
expected0 = np.array([[15], [51]])
|
||||||
|
expected1 = np.array([[6, 8, 10, 12, 14, 16]])
|
||||||
|
assert_array_equal(m0, expected0)
|
||||||
|
assert_array_equal(m1, expected1)
|
||||||
|
|
||||||
|
a = np.arange(24).reshape(2, 3, 4)
|
||||||
|
m0, m1, m2 = margins(a)
|
||||||
|
expected0 = np.array([[[66]], [[210]]])
|
||||||
|
expected1 = np.array([[[60], [92], [124]]])
|
||||||
|
expected2 = np.array([[[60, 66, 72, 78]]])
|
||||||
|
assert_array_equal(m0, expected0)
|
||||||
|
assert_array_equal(m1, expected1)
|
||||||
|
assert_array_equal(m2, expected2)
|
||||||
|
|
||||||
|
|
||||||
|
def test_expected_freq():
|
||||||
|
assert_array_equal(expected_freq([1]), np.array([1.0]))
|
||||||
|
|
||||||
|
observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
|
||||||
|
e = expected_freq(observed)
|
||||||
|
assert_array_equal(e, np.ones_like(observed))
|
||||||
|
|
||||||
|
observed = np.array([[10, 10, 20], [20, 20, 20]])
|
||||||
|
e = expected_freq(observed)
|
||||||
|
correct = np.array([[12., 12., 16.], [18., 18., 24.]])
|
||||||
|
assert_array_almost_equal(e, correct)
|
||||||
|
|
||||||
|
|
||||||
|
def test_chi2_contingency_trivial():
|
||||||
|
# Some very simple tests for chi2_contingency.
|
||||||
|
|
||||||
|
# A trivial case
|
||||||
|
obs = np.array([[1, 2], [1, 2]])
|
||||||
|
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
|
||||||
|
assert_equal(chi2, 0.0)
|
||||||
|
assert_equal(p, 1.0)
|
||||||
|
assert_equal(dof, 1)
|
||||||
|
assert_array_equal(obs, expected)
|
||||||
|
|
||||||
|
# A *really* trivial case: 1-D data.
|
||||||
|
obs = np.array([1, 2, 3])
|
||||||
|
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
|
||||||
|
assert_equal(chi2, 0.0)
|
||||||
|
assert_equal(p, 1.0)
|
||||||
|
assert_equal(dof, 0)
|
||||||
|
assert_array_equal(obs, expected)
|
||||||
|
|
||||||
|
|
||||||
|
def test_chi2_contingency_R():
|
||||||
|
# Some test cases that were computed independently, using R.
|
||||||
|
|
||||||
|
Rcode = \
|
||||||
|
"""
|
||||||
|
# Data vector.
|
||||||
|
data <- c(
|
||||||
|
12, 34, 23, 4, 47, 11,
|
||||||
|
35, 31, 11, 34, 10, 18,
|
||||||
|
12, 32, 9, 18, 13, 19,
|
||||||
|
12, 12, 14, 9, 33, 25
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create factor tags:r=rows, c=columns, t=tiers
|
||||||
|
r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
|
||||||
|
c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3")))
|
||||||
|
t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2")))
|
||||||
|
|
||||||
|
# 3-way Chi squared test of independence
|
||||||
|
s = summary(xtabs(data~r+c+t))
|
||||||
|
print(s)
|
||||||
|
"""
|
||||||
|
Routput = \
|
||||||
|
"""
|
||||||
|
Call: xtabs(formula = data ~ r + c + t)
|
||||||
|
Number of cases in table: 478
|
||||||
|
Number of factors: 3
|
||||||
|
Test for independence of all factors:
|
||||||
|
Chisq = 102.17, df = 17, p-value = 3.514e-14
|
||||||
|
"""
|
||||||
|
obs = np.array(
|
||||||
|
[[[12, 34, 23],
|
||||||
|
[35, 31, 11],
|
||||||
|
[12, 32, 9],
|
||||||
|
[12, 12, 14]],
|
||||||
|
[[4, 47, 11],
|
||||||
|
[34, 10, 18],
|
||||||
|
[18, 13, 19],
|
||||||
|
[9, 33, 25]]])
|
||||||
|
chi2, p, dof, expected = chi2_contingency(obs)
|
||||||
|
assert_approx_equal(chi2, 102.17, significant=5)
|
||||||
|
assert_approx_equal(p, 3.514e-14, significant=4)
|
||||||
|
assert_equal(dof, 17)
|
||||||
|
|
||||||
|
Rcode = \
|
||||||
|
"""
|
||||||
|
# Data vector.
|
||||||
|
data <- c(
|
||||||
|
#
|
||||||
|
12, 17,
|
||||||
|
11, 16,
|
||||||
|
#
|
||||||
|
11, 12,
|
||||||
|
15, 16,
|
||||||
|
#
|
||||||
|
23, 15,
|
||||||
|
30, 22,
|
||||||
|
#
|
||||||
|
14, 17,
|
||||||
|
15, 16
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
|
||||||
|
r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
|
||||||
|
c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
|
||||||
|
d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
|
||||||
|
t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
|
||||||
|
|
||||||
|
# 4-way Chi squared test of independence
|
||||||
|
s = summary(xtabs(data~r+c+d+t))
|
||||||
|
print(s)
|
||||||
|
"""
|
||||||
|
Routput = \
|
||||||
|
"""
|
||||||
|
Call: xtabs(formula = data ~ r + c + d + t)
|
||||||
|
Number of cases in table: 262
|
||||||
|
Number of factors: 4
|
||||||
|
Test for independence of all factors:
|
||||||
|
Chisq = 8.758, df = 11, p-value = 0.6442
|
||||||
|
"""
|
||||||
|
obs = np.array(
|
||||||
|
[[[[12, 17],
|
||||||
|
[11, 16]],
|
||||||
|
[[11, 12],
|
||||||
|
[15, 16]]],
|
||||||
|
[[[23, 15],
|
||||||
|
[30, 22]],
|
||||||
|
[[14, 17],
|
||||||
|
[15, 16]]]])
|
||||||
|
chi2, p, dof, expected = chi2_contingency(obs)
|
||||||
|
assert_approx_equal(chi2, 8.758, significant=4)
|
||||||
|
assert_approx_equal(p, 0.6442, significant=4)
|
||||||
|
assert_equal(dof, 11)
|
||||||
|
|
||||||
|
|
||||||
|
def test_chi2_contingency_g():
|
||||||
|
c = np.array([[15, 60], [15, 90]])
|
||||||
|
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=False)
|
||||||
|
assert_allclose(g, 2*xlogy(c, c/e).sum())
|
||||||
|
|
||||||
|
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=True)
|
||||||
|
c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]])
|
||||||
|
assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum())
|
||||||
|
|
||||||
|
c = np.array([[10, 12, 10], [12, 10, 10]])
|
||||||
|
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood')
|
||||||
|
assert_allclose(g, 2*xlogy(c, c/e).sum())
|
||||||
|
|
||||||
|
|
||||||
|
def test_chi2_contingency_bad_args():
|
||||||
|
# Test that "bad" inputs raise a ValueError.
|
||||||
|
|
||||||
|
# Negative value in the array of observed frequencies.
|
||||||
|
obs = np.array([[-1, 10], [1, 2]])
|
||||||
|
assert_raises(ValueError, chi2_contingency, obs)
|
||||||
|
|
||||||
|
# The zeros in this will result in zeros in the array
|
||||||
|
# of expected frequencies.
|
||||||
|
obs = np.array([[0, 1], [0, 1]])
|
||||||
|
assert_raises(ValueError, chi2_contingency, obs)
|
||||||
|
|
||||||
|
# A degenerate case: `observed` has size 0.
|
||||||
|
obs = np.empty((0, 8))
|
||||||
|
assert_raises(ValueError, chi2_contingency, obs)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_module_suite()
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,789 @@
|
|||||||
|
# Author: Travis Oliphant, 2002
|
||||||
|
#
|
||||||
|
# Further enhancements and tests added by numerous SciPy developers.
|
||||||
|
#
|
||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from numpy.random import RandomState
|
||||||
|
from numpy.testing import (TestCase, run_module_suite, assert_array_equal,
|
||||||
|
assert_almost_equal, assert_array_less, assert_array_almost_equal,
|
||||||
|
assert_raises, assert_, assert_allclose, assert_equal, dec)
|
||||||
|
|
||||||
|
from scipy import stats
|
||||||
|
|
||||||
|
# Matplotlib is not a scipy dependency but is optionally used in probplot, so
|
||||||
|
# check if it's available
|
||||||
|
try:
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
have_matplotlib = True
|
||||||
|
except:
|
||||||
|
have_matplotlib = False
|
||||||
|
|
||||||
|
|
||||||
|
g1 = [1.006, 0.996, 0.998, 1.000, 0.992, 0.993, 1.002, 0.999, 0.994, 1.000]
|
||||||
|
g2 = [0.998, 1.006, 1.000, 1.002, 0.997, 0.998, 0.996, 1.000, 1.006, 0.988]
|
||||||
|
g3 = [0.991, 0.987, 0.997, 0.999, 0.995, 0.994, 1.000, 0.999, 0.996, 0.996]
|
||||||
|
g4 = [1.005, 1.002, 0.994, 1.000, 0.995, 0.994, 0.998, 0.996, 1.002, 0.996]
|
||||||
|
g5 = [0.998, 0.998, 0.982, 0.990, 1.002, 0.984, 0.996, 0.993, 0.980, 0.996]
|
||||||
|
g6 = [1.009, 1.013, 1.009, 0.997, 0.988, 1.002, 0.995, 0.998, 0.981, 0.996]
|
||||||
|
g7 = [0.990, 1.004, 0.996, 1.001, 0.998, 1.000, 1.018, 1.010, 0.996, 1.002]
|
||||||
|
g8 = [0.998, 1.000, 1.006, 1.000, 1.002, 0.996, 0.998, 0.996, 1.002, 1.006]
|
||||||
|
g9 = [1.002, 0.998, 0.996, 0.995, 0.996, 1.004, 1.004, 0.998, 0.999, 0.991]
|
||||||
|
g10 = [0.991, 0.995, 0.984, 0.994, 0.997, 0.997, 0.991, 0.998, 1.004, 0.997]
|
||||||
|
|
||||||
|
|
||||||
|
class TestShapiro(TestCase):
|
||||||
|
def test_basic(self):
|
||||||
|
x1 = [0.11,7.87,4.61,10.14,7.95,3.14,0.46,
|
||||||
|
4.43,0.21,4.75,0.71,1.52,3.24,
|
||||||
|
0.93,0.42,4.97,9.53,4.55,0.47,6.66]
|
||||||
|
w,pw = stats.shapiro(x1)
|
||||||
|
assert_almost_equal(w,0.90047299861907959,6)
|
||||||
|
assert_almost_equal(pw,0.042089745402336121,6)
|
||||||
|
x2 = [1.36,1.14,2.92,2.55,1.46,1.06,5.27,-1.11,
|
||||||
|
3.48,1.10,0.88,-0.51,1.46,0.52,6.20,1.69,
|
||||||
|
0.08,3.67,2.81,3.49]
|
||||||
|
w,pw = stats.shapiro(x2)
|
||||||
|
assert_almost_equal(w,0.9590270,6)
|
||||||
|
assert_almost_equal(pw,0.52460,3)
|
||||||
|
|
||||||
|
def test_bad_arg(self):
|
||||||
|
# Length of x is less than 3.
|
||||||
|
x = [1]
|
||||||
|
assert_raises(ValueError, stats.shapiro, x)
|
||||||
|
|
||||||
|
|
||||||
|
class TestAnderson(TestCase):
|
||||||
|
def test_normal(self):
|
||||||
|
rs = RandomState(1234567890)
|
||||||
|
x1 = rs.standard_exponential(size=50)
|
||||||
|
x2 = rs.standard_normal(size=50)
|
||||||
|
A,crit,sig = stats.anderson(x1)
|
||||||
|
assert_array_less(crit[:-1], A)
|
||||||
|
A,crit,sig = stats.anderson(x2)
|
||||||
|
assert_array_less(A, crit[-2:])
|
||||||
|
|
||||||
|
def test_expon(self):
|
||||||
|
rs = RandomState(1234567890)
|
||||||
|
x1 = rs.standard_exponential(size=50)
|
||||||
|
x2 = rs.standard_normal(size=50)
|
||||||
|
A,crit,sig = stats.anderson(x1,'expon')
|
||||||
|
assert_array_less(A, crit[-2:])
|
||||||
|
olderr = np.seterr(all='ignore')
|
||||||
|
try:
|
||||||
|
A,crit,sig = stats.anderson(x2,'expon')
|
||||||
|
finally:
|
||||||
|
np.seterr(**olderr)
|
||||||
|
assert_(A > crit[-1])
|
||||||
|
|
||||||
|
def test_bad_arg(self):
|
||||||
|
assert_raises(ValueError, stats.anderson, [1], dist='plate_of_shrimp')
|
||||||
|
|
||||||
|
|
||||||
|
class TestAnsari(TestCase):
|
||||||
|
|
||||||
|
def test_small(self):
|
||||||
|
x = [1,2,3,3,4]
|
||||||
|
y = [3,2,6,1,6,1,4,1]
|
||||||
|
W, pval = stats.ansari(x,y)
|
||||||
|
assert_almost_equal(W,23.5,11)
|
||||||
|
assert_almost_equal(pval,0.13499256881897437,11)
|
||||||
|
|
||||||
|
def test_approx(self):
|
||||||
|
ramsay = np.array((111, 107, 100, 99, 102, 106, 109, 108, 104, 99,
|
||||||
|
101, 96, 97, 102, 107, 113, 116, 113, 110, 98))
|
||||||
|
parekh = np.array((107, 108, 106, 98, 105, 103, 110, 105, 104,
|
||||||
|
100, 96, 108, 103, 104, 114, 114, 113, 108, 106, 99))
|
||||||
|
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.filterwarnings('ignore',
|
||||||
|
message="Ties preclude use of exact statistic.")
|
||||||
|
W, pval = stats.ansari(ramsay, parekh)
|
||||||
|
|
||||||
|
assert_almost_equal(W,185.5,11)
|
||||||
|
assert_almost_equal(pval,0.18145819972867083,11)
|
||||||
|
|
||||||
|
def test_exact(self):
|
||||||
|
W,pval = stats.ansari([1,2,3,4],[15,5,20,8,10,12])
|
||||||
|
assert_almost_equal(W,10.0,11)
|
||||||
|
assert_almost_equal(pval,0.533333333333333333,7)
|
||||||
|
|
||||||
|
def test_bad_arg(self):
|
||||||
|
assert_raises(ValueError, stats.ansari, [], [1])
|
||||||
|
assert_raises(ValueError, stats.ansari, [1], [])
|
||||||
|
|
||||||
|
|
||||||
|
class TestBartlett(TestCase):
|
||||||
|
|
||||||
|
def test_data(self):
|
||||||
|
args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10]
|
||||||
|
T, pval = stats.bartlett(*args)
|
||||||
|
assert_almost_equal(T,20.78587342806484,7)
|
||||||
|
assert_almost_equal(pval,0.0136358632781,7)
|
||||||
|
|
||||||
|
def test_bad_arg(self):
|
||||||
|
# Too few args raises ValueError.
|
||||||
|
assert_raises(ValueError, stats.bartlett, [1])
|
||||||
|
|
||||||
|
|
||||||
|
class TestLevene(TestCase):
|
||||||
|
|
||||||
|
def test_data(self):
|
||||||
|
args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10]
|
||||||
|
W, pval = stats.levene(*args)
|
||||||
|
assert_almost_equal(W,1.7059176930008939,7)
|
||||||
|
assert_almost_equal(pval,0.0990829755522,7)
|
||||||
|
|
||||||
|
def test_trimmed1(self):
|
||||||
|
# Test that center='trimmed' gives the same result as center='mean'
|
||||||
|
# when proportiontocut=0.
|
||||||
|
W1, pval1 = stats.levene(g1, g2, g3, center='mean')
|
||||||
|
W2, pval2 = stats.levene(g1, g2, g3, center='trimmed', proportiontocut=0.0)
|
||||||
|
assert_almost_equal(W1, W2)
|
||||||
|
assert_almost_equal(pval1, pval2)
|
||||||
|
|
||||||
|
def test_trimmed2(self):
|
||||||
|
x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0]
|
||||||
|
y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0]
|
||||||
|
np.random.seed(1234)
|
||||||
|
x2 = np.random.permutation(x)
|
||||||
|
|
||||||
|
# Use center='trimmed'
|
||||||
|
W0, pval0 = stats.levene(x, y, center='trimmed', proportiontocut=0.125)
|
||||||
|
W1, pval1 = stats.levene(x2, y, center='trimmed', proportiontocut=0.125)
|
||||||
|
# Trim the data here, and use center='mean'
|
||||||
|
W2, pval2 = stats.levene(x[1:-1], y[1:-1], center='mean')
|
||||||
|
# Result should be the same.
|
||||||
|
assert_almost_equal(W0, W2)
|
||||||
|
assert_almost_equal(W1, W2)
|
||||||
|
assert_almost_equal(pval1, pval2)
|
||||||
|
|
||||||
|
def test_equal_mean_median(self):
|
||||||
|
x = np.linspace(-1,1,21)
|
||||||
|
np.random.seed(1234)
|
||||||
|
x2 = np.random.permutation(x)
|
||||||
|
y = x**3
|
||||||
|
W1, pval1 = stats.levene(x, y, center='mean')
|
||||||
|
W2, pval2 = stats.levene(x2, y, center='median')
|
||||||
|
assert_almost_equal(W1, W2)
|
||||||
|
assert_almost_equal(pval1, pval2)
|
||||||
|
|
||||||
|
def test_bad_keyword(self):
|
||||||
|
x = np.linspace(-1,1,21)
|
||||||
|
assert_raises(TypeError, stats.levene, x, x, portiontocut=0.1)
|
||||||
|
|
||||||
|
def test_bad_center_value(self):
|
||||||
|
x = np.linspace(-1,1,21)
|
||||||
|
assert_raises(ValueError, stats.levene, x, x, center='trim')
|
||||||
|
|
||||||
|
def test_too_few_args(self):
|
||||||
|
assert_raises(ValueError, stats.levene, [1])
|
||||||
|
|
||||||
|
|
||||||
|
class TestBinomP(TestCase):
|
||||||
|
|
||||||
|
def test_data(self):
|
||||||
|
pval = stats.binom_test(100,250)
|
||||||
|
assert_almost_equal(pval,0.0018833009350757682,11)
|
||||||
|
pval = stats.binom_test(201,405)
|
||||||
|
assert_almost_equal(pval,0.92085205962670713,11)
|
||||||
|
pval = stats.binom_test([682,243],p=3.0/4)
|
||||||
|
assert_almost_equal(pval,0.38249155957481695,11)
|
||||||
|
|
||||||
|
def test_bad_len_x(self):
|
||||||
|
# Length of x must be 1 or 2.
|
||||||
|
assert_raises(ValueError, stats.binom_test, [1,2,3])
|
||||||
|
|
||||||
|
def test_bad_n(self):
|
||||||
|
# len(x) is 1, but n is invalid.
|
||||||
|
# Missing n
|
||||||
|
assert_raises(ValueError, stats.binom_test, [100])
|
||||||
|
# n less than x[0]
|
||||||
|
assert_raises(ValueError, stats.binom_test, [100], n=50)
|
||||||
|
|
||||||
|
def test_bad_p(self):
|
||||||
|
assert_raises(ValueError, stats.binom_test, [50, 50], p=2.0)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFindRepeats(TestCase):
|
||||||
|
|
||||||
|
def test_basic(self):
|
||||||
|
a = [1,2,3,4,1,2,3,4,1,2,5]
|
||||||
|
res,nums = stats.find_repeats(a)
|
||||||
|
assert_array_equal(res,[1,2,3,4])
|
||||||
|
assert_array_equal(nums,[3,3,2,2])
|
||||||
|
|
||||||
|
def test_empty_result(self):
|
||||||
|
# Check that empty arrays are returned when there are no repeats.
|
||||||
|
a = [10, 20, 50, 30, 40]
|
||||||
|
repeated, counts = stats.find_repeats(a)
|
||||||
|
assert_array_equal(repeated, [])
|
||||||
|
assert_array_equal(counts, [])
|
||||||
|
|
||||||
|
|
||||||
|
class TestFligner(TestCase):
|
||||||
|
|
||||||
|
def test_data(self):
|
||||||
|
# numbers from R: fligner.test in package stats
|
||||||
|
x1 = np.arange(5)
|
||||||
|
assert_array_almost_equal(stats.fligner(x1,x1**2),
|
||||||
|
(3.2282229927203536, 0.072379187848207877), 11)
|
||||||
|
|
||||||
|
def test_trimmed1(self):
|
||||||
|
# Test that center='trimmed' gives the same result as center='mean'
|
||||||
|
# when proportiontocut=0.
|
||||||
|
Xsq1, pval1 = stats.fligner(g1, g2, g3, center='mean')
|
||||||
|
Xsq2, pval2 = stats.fligner(g1, g2, g3, center='trimmed', proportiontocut=0.0)
|
||||||
|
assert_almost_equal(Xsq1, Xsq2)
|
||||||
|
assert_almost_equal(pval1, pval2)
|
||||||
|
|
||||||
|
def test_trimmed2(self):
|
||||||
|
x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0]
|
||||||
|
y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0]
|
||||||
|
# Use center='trimmed'
|
||||||
|
Xsq1, pval1 = stats.fligner(x, y, center='trimmed', proportiontocut=0.125)
|
||||||
|
# Trim the data here, and use center='mean'
|
||||||
|
Xsq2, pval2 = stats.fligner(x[1:-1], y[1:-1], center='mean')
|
||||||
|
# Result should be the same.
|
||||||
|
assert_almost_equal(Xsq1, Xsq2)
|
||||||
|
assert_almost_equal(pval1, pval2)
|
||||||
|
|
||||||
|
# The following test looks reasonable at first, but fligner() uses the
|
||||||
|
# function stats.rankdata(), and in one of the cases in this test,
|
||||||
|
# there are ties, while in the other (because of normal rounding
|
||||||
|
# errors) there are not. This difference leads to differences in the
|
||||||
|
# third significant digit of W.
|
||||||
|
#
|
||||||
|
#def test_equal_mean_median(self):
|
||||||
|
# x = np.linspace(-1,1,21)
|
||||||
|
# y = x**3
|
||||||
|
# W1, pval1 = stats.fligner(x, y, center='mean')
|
||||||
|
# W2, pval2 = stats.fligner(x, y, center='median')
|
||||||
|
# assert_almost_equal(W1, W2)
|
||||||
|
# assert_almost_equal(pval1, pval2)
|
||||||
|
|
||||||
|
def test_bad_keyword(self):
|
||||||
|
x = np.linspace(-1,1,21)
|
||||||
|
assert_raises(TypeError, stats.fligner, x, x, portiontocut=0.1)
|
||||||
|
|
||||||
|
def test_bad_center_value(self):
|
||||||
|
x = np.linspace(-1,1,21)
|
||||||
|
assert_raises(ValueError, stats.fligner, x, x, center='trim')
|
||||||
|
|
||||||
|
def test_bad_num_args(self):
|
||||||
|
# Too few args raises ValueError.
|
||||||
|
assert_raises(ValueError, stats.fligner, [1])
|
||||||
|
|
||||||
|
|
||||||
|
class TestMood(TestCase):
|
||||||
|
def test_mood(self):
|
||||||
|
# numbers from R: mood.test in package stats
|
||||||
|
x1 = np.arange(5)
|
||||||
|
assert_array_almost_equal(stats.mood(x1, x1**2),
|
||||||
|
(-1.3830857299399906, 0.16663858066771478), 11)
|
||||||
|
|
||||||
|
def test_mood_order_of_args(self):
|
||||||
|
# z should change sign when the order of arguments changes, pvalue
|
||||||
|
# should not change
|
||||||
|
np.random.seed(1234)
|
||||||
|
x1 = np.random.randn(10, 1)
|
||||||
|
x2 = np.random.randn(15, 1)
|
||||||
|
z1, p1 = stats.mood(x1, x2)
|
||||||
|
z2, p2 = stats.mood(x2, x1)
|
||||||
|
assert_array_almost_equal([z1, p1], [-z2, p2])
|
||||||
|
|
||||||
|
def test_mood_with_axis_none(self):
|
||||||
|
#Test with axis = None, compare with results from R
|
||||||
|
x1 = [-0.626453810742332, 0.183643324222082, -0.835628612410047,
|
||||||
|
1.59528080213779, 0.329507771815361, -0.820468384118015,
|
||||||
|
0.487429052428485, 0.738324705129217, 0.575781351653492,
|
||||||
|
-0.305388387156356, 1.51178116845085, 0.389843236411431,
|
||||||
|
-0.621240580541804, -2.2146998871775, 1.12493091814311,
|
||||||
|
-0.0449336090152309, -0.0161902630989461, 0.943836210685299,
|
||||||
|
0.821221195098089, 0.593901321217509]
|
||||||
|
|
||||||
|
x2 = [-0.896914546624981, 0.184849184646742, 1.58784533120882,
|
||||||
|
-1.13037567424629, -0.0802517565509893, 0.132420284381094,
|
||||||
|
0.707954729271733, -0.23969802417184, 1.98447393665293,
|
||||||
|
-0.138787012119665, 0.417650750792556, 0.981752777463662,
|
||||||
|
-0.392695355503813, -1.03966897694891, 1.78222896030858,
|
||||||
|
-2.31106908460517, 0.878604580921265, 0.035806718015226,
|
||||||
|
1.01282869212708, 0.432265154539617, 2.09081920524915,
|
||||||
|
-1.19992581964387, 1.58963820029007, 1.95465164222325,
|
||||||
|
0.00493777682814261, -2.45170638784613, 0.477237302613617,
|
||||||
|
-0.596558168631403, 0.792203270299649, 0.289636710177348]
|
||||||
|
|
||||||
|
x1 = np.array(x1)
|
||||||
|
x2 = np.array(x2)
|
||||||
|
x1.shape = (10, 2)
|
||||||
|
x2.shape = (15, 2)
|
||||||
|
assert_array_almost_equal(stats.mood(x1, x2, axis=None),
|
||||||
|
[-1.31716607555, 0.18778296257])
|
||||||
|
|
||||||
|
def test_mood_2d(self):
|
||||||
|
# Test if the results of mood test in 2-D case are consistent with the
|
||||||
|
# R result for the same inputs. Numbers from R mood.test().
|
||||||
|
ny = 5
|
||||||
|
np.random.seed(1234)
|
||||||
|
x1 = np.random.randn(10, ny)
|
||||||
|
x2 = np.random.randn(15, ny)
|
||||||
|
z_vectest, pval_vectest = stats.mood(x1, x2)
|
||||||
|
|
||||||
|
for j in range(ny):
|
||||||
|
assert_array_almost_equal([z_vectest[j], pval_vectest[j]],
|
||||||
|
stats.mood(x1[:, j], x2[:, j]))
|
||||||
|
|
||||||
|
# inverse order of dimensions
|
||||||
|
x1 = x1.transpose()
|
||||||
|
x2 = x2.transpose()
|
||||||
|
z_vectest, pval_vectest = stats.mood(x1, x2, axis=1)
|
||||||
|
|
||||||
|
for i in range(ny):
|
||||||
|
# check axis handling is self consistent
|
||||||
|
assert_array_almost_equal([z_vectest[i], pval_vectest[i]],
|
||||||
|
stats.mood(x1[i, :], x2[i, :]))
|
||||||
|
|
||||||
|
def test_mood_3d(self):
|
||||||
|
shape = (10, 5, 6)
|
||||||
|
np.random.seed(1234)
|
||||||
|
x1 = np.random.randn(*shape)
|
||||||
|
x2 = np.random.randn(*shape)
|
||||||
|
|
||||||
|
for axis in range(3):
|
||||||
|
z_vectest, pval_vectest = stats.mood(x1, x2, axis=axis)
|
||||||
|
# Tests that result for 3-D arrays is equal to that for the
|
||||||
|
# same calculation on a set of 1-D arrays taken from the
|
||||||
|
# 3-D array
|
||||||
|
axes_idx = ([1, 2], [0, 2], [0, 1]) # the two axes != axis
|
||||||
|
for i in range(shape[axes_idx[axis][0]]):
|
||||||
|
for j in range(shape[axes_idx[axis][1]]):
|
||||||
|
if axis == 0:
|
||||||
|
slice1 = x1[:, i, j]
|
||||||
|
slice2 = x2[:, i, j]
|
||||||
|
elif axis == 1:
|
||||||
|
slice1 = x1[i, :, j]
|
||||||
|
slice2 = x2[i, :, j]
|
||||||
|
else:
|
||||||
|
slice1 = x1[i, j, :]
|
||||||
|
slice2 = x2[i, j, :]
|
||||||
|
|
||||||
|
assert_array_almost_equal([z_vectest[i, j],
|
||||||
|
pval_vectest[i, j]],
|
||||||
|
stats.mood(slice1, slice2))
|
||||||
|
|
||||||
|
def test_mood_bad_arg(self):
|
||||||
|
# Raise ValueError when the sum of the lengths of the args is less than 3
|
||||||
|
assert_raises(ValueError, stats.mood, [1], [])
|
||||||
|
|
||||||
|
|
||||||
|
class TestProbplot(TestCase):
|
||||||
|
|
||||||
|
def test_basic(self):
|
||||||
|
np.random.seed(12345)
|
||||||
|
x = stats.norm.rvs(size=20)
|
||||||
|
osm, osr = stats.probplot(x, fit=False)
|
||||||
|
osm_expected = [-1.8241636, -1.38768012, -1.11829229, -0.91222575,
|
||||||
|
-0.73908135, -0.5857176, -0.44506467, -0.31273668,
|
||||||
|
-0.18568928, -0.06158146, 0.06158146, 0.18568928,
|
||||||
|
0.31273668, 0.44506467, 0.5857176, 0.73908135,
|
||||||
|
0.91222575, 1.11829229, 1.38768012, 1.8241636]
|
||||||
|
assert_allclose(osr, np.sort(x))
|
||||||
|
assert_allclose(osm, osm_expected)
|
||||||
|
|
||||||
|
res, res_fit = stats.probplot(x, fit=True)
|
||||||
|
res_fit_expected = [1.05361841, 0.31297795, 0.98741609]
|
||||||
|
assert_allclose(res_fit, res_fit_expected)
|
||||||
|
|
||||||
|
def test_sparams_keyword(self):
|
||||||
|
np.random.seed(123456)
|
||||||
|
x = stats.norm.rvs(size=100)
|
||||||
|
# Check that None, () and 0 (loc=0, for normal distribution) all work
|
||||||
|
# and give the same results
|
||||||
|
osm1, osr1 = stats.probplot(x, sparams=None, fit=False)
|
||||||
|
osm2, osr2 = stats.probplot(x, sparams=0, fit=False)
|
||||||
|
osm3, osr3 = stats.probplot(x, sparams=(), fit=False)
|
||||||
|
assert_allclose(osm1, osm2)
|
||||||
|
assert_allclose(osm1, osm3)
|
||||||
|
assert_allclose(osr1, osr2)
|
||||||
|
assert_allclose(osr1, osr3)
|
||||||
|
# Check giving (loc, scale) params for normal distribution
|
||||||
|
osm, osr = stats.probplot(x, sparams=(), fit=False)
|
||||||
|
|
||||||
|
def test_dist_keyword(self):
|
||||||
|
np.random.seed(12345)
|
||||||
|
x = stats.norm.rvs(size=20)
|
||||||
|
osm1, osr1 = stats.probplot(x, fit=False, dist='t', sparams=(3,))
|
||||||
|
osm2, osr2 = stats.probplot(x, fit=False, dist=stats.t, sparams=(3,))
|
||||||
|
assert_allclose(osm1, osm2)
|
||||||
|
assert_allclose(osr1, osr2)
|
||||||
|
|
||||||
|
assert_raises(ValueError, stats.probplot, x, dist='wrong-dist-name')
|
||||||
|
assert_raises(AttributeError, stats.probplot, x, dist=[])
|
||||||
|
|
||||||
|
class custom_dist(object):
|
||||||
|
"""Some class that looks just enough like a distribution."""
|
||||||
|
def ppf(self, q):
|
||||||
|
return stats.norm.ppf(q, loc=2)
|
||||||
|
|
||||||
|
osm1, osr1 = stats.probplot(x, sparams=(2,), fit=False)
|
||||||
|
osm2, osr2 = stats.probplot(x, dist=custom_dist(), fit=False)
|
||||||
|
assert_allclose(osm1, osm2)
|
||||||
|
assert_allclose(osr1, osr2)
|
||||||
|
|
||||||
|
@dec.skipif(not have_matplotlib)
|
||||||
|
def test_plot_kwarg(self):
|
||||||
|
np.random.seed(7654321)
|
||||||
|
fig = plt.figure()
|
||||||
|
fig.add_subplot(111)
|
||||||
|
x = stats.t.rvs(3, size=100)
|
||||||
|
res1, fitres1 = stats.probplot(x, plot=plt)
|
||||||
|
plt.close()
|
||||||
|
res2, fitres2 = stats.probplot(x, plot=None)
|
||||||
|
res3 = stats.probplot(x, fit=False, plot=plt)
|
||||||
|
plt.close()
|
||||||
|
res4 = stats.probplot(x, fit=False, plot=None)
|
||||||
|
# Check that results are consistent between combinations of `fit` and
|
||||||
|
# `plot` keywords.
|
||||||
|
assert_(len(res1) == len(res2) == len(res3) == len(res4) == 2)
|
||||||
|
assert_allclose(res1, res2)
|
||||||
|
assert_allclose(res1, res3)
|
||||||
|
assert_allclose(res1, res4)
|
||||||
|
assert_allclose(fitres1, fitres2)
|
||||||
|
|
||||||
|
# Check that a Matplotlib Axes object is accepted
|
||||||
|
fig = plt.figure()
|
||||||
|
ax = fig.add_subplot(111)
|
||||||
|
stats.probplot(x, fit=False, plot=ax)
|
||||||
|
plt.close()
|
||||||
|
|
||||||
|
def test_probplot_bad_args(self):
|
||||||
|
# Raise ValueError when given an invalid distribution.
|
||||||
|
assert_raises(ValueError, stats.probplot, [1], dist="plate_of_shrimp")
|
||||||
|
|
||||||
|
|
||||||
|
def test_wilcoxon_bad_arg():
|
||||||
|
# Raise ValueError when two args of different lengths are given or
|
||||||
|
# zero_method is unknown.
|
||||||
|
assert_raises(ValueError, stats.wilcoxon, [1], [1,2])
|
||||||
|
assert_raises(ValueError, stats.wilcoxon, [1,2], [1,2], "dummy")
|
||||||
|
|
||||||
|
|
||||||
|
def test_mvsdist_bad_arg():
|
||||||
|
# Raise ValueError if fewer than two data points are given.
|
||||||
|
data = [1]
|
||||||
|
assert_raises(ValueError, stats.mvsdist, data)
|
||||||
|
|
||||||
|
|
||||||
|
def test_kstat_bad_arg():
|
||||||
|
# Raise ValueError if n > 4 or n > 1.
|
||||||
|
data = [1]
|
||||||
|
n = 10
|
||||||
|
assert_raises(ValueError, stats.kstat, data, n=n)
|
||||||
|
|
||||||
|
|
||||||
|
def test_kstatvar_bad_arg():
|
||||||
|
# Raise ValueError is n is not 1 or 2.
|
||||||
|
data = [1]
|
||||||
|
n = 10
|
||||||
|
assert_raises(ValueError, stats.kstatvar, data, n=n)
|
||||||
|
|
||||||
|
|
||||||
|
def test_ppcc_max_bad_arg():
|
||||||
|
# Raise ValueError when given an invalid distribution.
|
||||||
|
data = [1]
|
||||||
|
assert_raises(ValueError, stats.ppcc_max, data, dist="plate_of_shrimp")
|
||||||
|
|
||||||
|
|
||||||
|
class TestBoxcox_llf(TestCase):
|
||||||
|
|
||||||
|
def test_basic(self):
|
||||||
|
np.random.seed(54321)
|
||||||
|
x = stats.norm.rvs(size=10000, loc=10)
|
||||||
|
lmbda = 1
|
||||||
|
llf = stats.boxcox_llf(lmbda, x)
|
||||||
|
llf_expected = -x.size / 2. * np.log(np.sum(x.std()**2))
|
||||||
|
assert_allclose(llf, llf_expected)
|
||||||
|
|
||||||
|
def test_array_like(self):
|
||||||
|
np.random.seed(54321)
|
||||||
|
x = stats.norm.rvs(size=100, loc=10)
|
||||||
|
lmbda = 1
|
||||||
|
llf = stats.boxcox_llf(lmbda, x)
|
||||||
|
llf2 = stats.boxcox_llf(lmbda, list(x))
|
||||||
|
assert_allclose(llf, llf2, rtol=1e-12)
|
||||||
|
|
||||||
|
def test_2d_input(self):
|
||||||
|
# Note: boxcox_llf() was already working with 2-D input (sort of), so
|
||||||
|
# keep it like that. boxcox() doesn't work with 2-D input though, due
|
||||||
|
# to brent() returning a scalar.
|
||||||
|
np.random.seed(54321)
|
||||||
|
x = stats.norm.rvs(size=100, loc=10)
|
||||||
|
lmbda = 1
|
||||||
|
llf = stats.boxcox_llf(lmbda, x)
|
||||||
|
llf2 = stats.boxcox_llf(lmbda, np.vstack([x, x]).T)
|
||||||
|
assert_allclose([llf, llf], llf2, rtol=1e-12)
|
||||||
|
|
||||||
|
def test_empty(self):
|
||||||
|
assert_(np.isnan(stats.boxcox_llf(1, [])))
|
||||||
|
|
||||||
|
|
||||||
|
class TestBoxcox(TestCase):
|
||||||
|
|
||||||
|
def test_fixed_lmbda(self):
|
||||||
|
np.random.seed(12345)
|
||||||
|
x = stats.loggamma.rvs(5, size=50) + 5
|
||||||
|
xt = stats.boxcox(x, lmbda=1)
|
||||||
|
assert_allclose(xt, x - 1)
|
||||||
|
xt = stats.boxcox(x, lmbda=-1)
|
||||||
|
assert_allclose(xt, 1 - 1/x)
|
||||||
|
|
||||||
|
xt = stats.boxcox(x, lmbda=0)
|
||||||
|
assert_allclose(xt, np.log(x))
|
||||||
|
|
||||||
|
# Also test that array_like input works
|
||||||
|
xt = stats.boxcox(list(x), lmbda=0)
|
||||||
|
assert_allclose(xt, np.log(x))
|
||||||
|
|
||||||
|
def test_lmbda_None(self):
|
||||||
|
np.random.seed(1234567)
|
||||||
|
# Start from normal rv's, do inverse transform to check that
|
||||||
|
# optimization function gets close to the right answer.
|
||||||
|
np.random.seed(1245)
|
||||||
|
lmbda = 2.5
|
||||||
|
x = stats.norm.rvs(loc=10, size=50000)
|
||||||
|
x_inv = (x * lmbda + 1)**(-lmbda)
|
||||||
|
xt, maxlog = stats.boxcox(x_inv)
|
||||||
|
|
||||||
|
assert_almost_equal(maxlog, -1 / lmbda, decimal=2)
|
||||||
|
|
||||||
|
def test_alpha(self):
|
||||||
|
np.random.seed(1234)
|
||||||
|
x = stats.loggamma.rvs(5, size=50) + 5
|
||||||
|
|
||||||
|
# Some regular values for alpha, on a small sample size
|
||||||
|
_, _, interval = stats.boxcox(x, alpha=0.75)
|
||||||
|
assert_allclose(interval, [4.004485780226041, 5.138756355035744])
|
||||||
|
_, _, interval = stats.boxcox(x, alpha=0.05)
|
||||||
|
assert_allclose(interval, [1.2138178554857557, 8.209033272375663])
|
||||||
|
|
||||||
|
# Try some extreme values, see we don't hit the N=500 limit
|
||||||
|
x = stats.loggamma.rvs(7, size=500) + 15
|
||||||
|
_, _, interval = stats.boxcox(x, alpha=0.001)
|
||||||
|
assert_allclose(interval, [0.3988867, 11.40553131])
|
||||||
|
_, _, interval = stats.boxcox(x, alpha=0.999)
|
||||||
|
assert_allclose(interval, [5.83316246, 5.83735292])
|
||||||
|
|
||||||
|
def test_boxcox_bad_arg(self):
|
||||||
|
# Raise ValueError if any data value is negative.
|
||||||
|
x = np.array([-1])
|
||||||
|
assert_raises(ValueError, stats.boxcox, x)
|
||||||
|
|
||||||
|
def test_empty(self):
|
||||||
|
assert_(stats.boxcox([]).shape == (0,))
|
||||||
|
|
||||||
|
|
||||||
|
class TestBoxcoxNormmax(TestCase):
|
||||||
|
def setUp(self):
|
||||||
|
np.random.seed(12345)
|
||||||
|
self.x = stats.loggamma.rvs(5, size=50) + 5
|
||||||
|
|
||||||
|
def test_pearsonr(self):
|
||||||
|
maxlog = stats.boxcox_normmax(self.x)
|
||||||
|
assert_allclose(maxlog, 1.804465325046)
|
||||||
|
|
||||||
|
def test_mle(self):
|
||||||
|
maxlog = stats.boxcox_normmax(self.x, method='mle')
|
||||||
|
assert_allclose(maxlog, 1.758101454114)
|
||||||
|
|
||||||
|
# Check that boxcox() uses 'mle'
|
||||||
|
_, maxlog_boxcox = stats.boxcox(self.x)
|
||||||
|
assert_allclose(maxlog_boxcox, maxlog)
|
||||||
|
|
||||||
|
def test_all(self):
|
||||||
|
maxlog_all = stats.boxcox_normmax(self.x, method='all')
|
||||||
|
assert_allclose(maxlog_all, [1.804465325046, 1.758101454114])
|
||||||
|
|
||||||
|
|
||||||
|
class TestBoxcoxNormplot(TestCase):
|
||||||
|
def setUp(self):
|
||||||
|
np.random.seed(7654321)
|
||||||
|
self.x = stats.loggamma.rvs(5, size=500) + 5
|
||||||
|
|
||||||
|
def test_basic(self):
|
||||||
|
N = 5
|
||||||
|
lmbdas, ppcc = stats.boxcox_normplot(self.x, -10, 10, N=N)
|
||||||
|
ppcc_expected = [0.57783375, 0.83610988, 0.97524311, 0.99756057,
|
||||||
|
0.95843297]
|
||||||
|
assert_allclose(lmbdas, np.linspace(-10, 10, num=N))
|
||||||
|
assert_allclose(ppcc, ppcc_expected)
|
||||||
|
|
||||||
|
@dec.skipif(not have_matplotlib)
|
||||||
|
def test_plot_kwarg(self):
|
||||||
|
# Check with the matplotlib.pyplot module
|
||||||
|
fig = plt.figure()
|
||||||
|
fig.add_subplot(111)
|
||||||
|
stats.boxcox_normplot(self.x, -20, 20, plot=plt)
|
||||||
|
plt.close()
|
||||||
|
|
||||||
|
# Check that a Matplotlib Axes object is accepted
|
||||||
|
fig.add_subplot(111)
|
||||||
|
ax = fig.add_subplot(111)
|
||||||
|
stats.boxcox_normplot(self.x, -20, 20, plot=ax)
|
||||||
|
plt.close()
|
||||||
|
|
||||||
|
def test_invalid_inputs(self):
|
||||||
|
# `lb` has to be larger than `la`
|
||||||
|
assert_raises(ValueError, stats.boxcox_normplot, self.x, 1, 0)
|
||||||
|
# `x` can not contain negative values
|
||||||
|
assert_raises(ValueError, stats.boxcox_normplot, [-1, 1] , 0, 1)
|
||||||
|
|
||||||
|
def test_empty(self):
|
||||||
|
assert_(stats.boxcox_normplot([], 0, 1).size == 0)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCircFuncs(TestCase):
|
||||||
|
def test_circfuncs(self):
|
||||||
|
x = np.array([355,5,2,359,10,350])
|
||||||
|
M = stats.circmean(x, high=360)
|
||||||
|
Mval = 0.167690146
|
||||||
|
assert_allclose(M, Mval, rtol=1e-7)
|
||||||
|
|
||||||
|
V = stats.circvar(x, high=360)
|
||||||
|
Vval = 42.51955609
|
||||||
|
assert_allclose(V, Vval, rtol=1e-7)
|
||||||
|
|
||||||
|
S = stats.circstd(x, high=360)
|
||||||
|
Sval = 6.520702116
|
||||||
|
assert_allclose(S, Sval, rtol=1e-7)
|
||||||
|
|
||||||
|
def test_circfuncs_small(self):
|
||||||
|
x = np.array([20,21,22,18,19,20.5,19.2])
|
||||||
|
M1 = x.mean()
|
||||||
|
M2 = stats.circmean(x, high=360)
|
||||||
|
assert_allclose(M2, M1, rtol=1e-5)
|
||||||
|
|
||||||
|
V1 = x.var()
|
||||||
|
V2 = stats.circvar(x, high=360)
|
||||||
|
assert_allclose(V2, V1, rtol=1e-4)
|
||||||
|
|
||||||
|
S1 = x.std()
|
||||||
|
S2 = stats.circstd(x, high=360)
|
||||||
|
assert_allclose(S2, S1, rtol=1e-4)
|
||||||
|
|
||||||
|
def test_circmean_axis(self):
|
||||||
|
x = np.array([[355,5,2,359,10,350],
|
||||||
|
[351,7,4,352,9,349],
|
||||||
|
[357,9,8,358,4,356]])
|
||||||
|
M1 = stats.circmean(x, high=360)
|
||||||
|
M2 = stats.circmean(x.ravel(), high=360)
|
||||||
|
assert_allclose(M1, M2, rtol=1e-14)
|
||||||
|
|
||||||
|
M1 = stats.circmean(x, high=360, axis=1)
|
||||||
|
M2 = [stats.circmean(x[i], high=360) for i in range(x.shape[0])]
|
||||||
|
assert_allclose(M1, M2, rtol=1e-14)
|
||||||
|
|
||||||
|
M1 = stats.circmean(x, high=360, axis=0)
|
||||||
|
M2 = [stats.circmean(x[:,i], high=360) for i in range(x.shape[1])]
|
||||||
|
assert_allclose(M1, M2, rtol=1e-14)
|
||||||
|
|
||||||
|
def test_circvar_axis(self):
|
||||||
|
x = np.array([[355,5,2,359,10,350],
|
||||||
|
[351,7,4,352,9,349],
|
||||||
|
[357,9,8,358,4,356]])
|
||||||
|
|
||||||
|
V1 = stats.circvar(x, high=360)
|
||||||
|
V2 = stats.circvar(x.ravel(), high=360)
|
||||||
|
assert_allclose(V1, V2, rtol=1e-11)
|
||||||
|
|
||||||
|
V1 = stats.circvar(x, high=360, axis=1)
|
||||||
|
V2 = [stats.circvar(x[i], high=360) for i in range(x.shape[0])]
|
||||||
|
assert_allclose(V1, V2, rtol=1e-11)
|
||||||
|
|
||||||
|
V1 = stats.circvar(x, high=360, axis=0)
|
||||||
|
V2 = [stats.circvar(x[:,i], high=360) for i in range(x.shape[1])]
|
||||||
|
assert_allclose(V1, V2, rtol=1e-11)
|
||||||
|
|
||||||
|
def test_circstd_axis(self):
|
||||||
|
x = np.array([[355,5,2,359,10,350],
|
||||||
|
[351,7,4,352,9,349],
|
||||||
|
[357,9,8,358,4,356]])
|
||||||
|
|
||||||
|
S1 = stats.circstd(x, high=360)
|
||||||
|
S2 = stats.circstd(x.ravel(), high=360)
|
||||||
|
assert_allclose(S1, S2, rtol=1e-11)
|
||||||
|
|
||||||
|
S1 = stats.circstd(x, high=360, axis=1)
|
||||||
|
S2 = [stats.circstd(x[i], high=360) for i in range(x.shape[0])]
|
||||||
|
assert_allclose(S1, S2, rtol=1e-11)
|
||||||
|
|
||||||
|
S1 = stats.circstd(x, high=360, axis=0)
|
||||||
|
S2 = [stats.circstd(x[:,i], high=360) for i in range(x.shape[1])]
|
||||||
|
assert_allclose(S1, S2, rtol=1e-11)
|
||||||
|
|
||||||
|
def test_circfuncs_array_like(self):
|
||||||
|
x = [355,5,2,359,10,350]
|
||||||
|
assert_allclose(stats.circmean(x, high=360), 0.167690146, rtol=1e-7)
|
||||||
|
assert_allclose(stats.circvar(x, high=360), 42.51955609, rtol=1e-7)
|
||||||
|
assert_allclose(stats.circstd(x, high=360), 6.520702116, rtol=1e-7)
|
||||||
|
|
||||||
|
def test_empty(self):
|
||||||
|
assert_(np.isnan(stats.circmean([])))
|
||||||
|
assert_(np.isnan(stats.circstd([])))
|
||||||
|
assert_(np.isnan(stats.circvar([])))
|
||||||
|
|
||||||
|
|
||||||
|
def test_accuracy_wilcoxon():
|
||||||
|
freq = [1, 4, 16, 15, 8, 4, 5, 1, 2]
|
||||||
|
nums = range(-4, 5)
|
||||||
|
x = np.concatenate([[u] * v for u, v in zip(nums, freq)])
|
||||||
|
y = np.zeros(x.size)
|
||||||
|
|
||||||
|
T, p = stats.wilcoxon(x, y, "pratt")
|
||||||
|
assert_allclose(T, 423)
|
||||||
|
assert_allclose(p, 0.00197547303533107)
|
||||||
|
|
||||||
|
T, p = stats.wilcoxon(x, y, "zsplit")
|
||||||
|
assert_allclose(T, 441)
|
||||||
|
assert_allclose(p, 0.0032145343172473055)
|
||||||
|
|
||||||
|
T, p = stats.wilcoxon(x, y, "wilcox")
|
||||||
|
assert_allclose(T, 327)
|
||||||
|
assert_allclose(p, 0.00641346115861)
|
||||||
|
|
||||||
|
# Test the 'correction' option, using values computed in R with:
|
||||||
|
# > wilcox.test(x, y, paired=TRUE, exact=FALSE, correct={FALSE,TRUE})
|
||||||
|
x = np.array([120, 114, 181, 188, 180, 146, 121, 191, 132, 113, 127, 112])
|
||||||
|
y = np.array([133, 143, 119, 189, 112, 199, 198, 113, 115, 121, 142, 187])
|
||||||
|
T, p = stats.wilcoxon(x, y, correction=False)
|
||||||
|
assert_equal(T, 34)
|
||||||
|
assert_allclose(p, 0.6948866, rtol=1e-6)
|
||||||
|
T, p = stats.wilcoxon(x, y, correction=True)
|
||||||
|
assert_equal(T, 34)
|
||||||
|
assert_allclose(p, 0.7240817, rtol=1e-6)
|
||||||
|
|
||||||
|
|
||||||
|
def test_wilcoxon_tie():
|
||||||
|
# Regression test for gh-2391.
|
||||||
|
# Corresponding R code is:
|
||||||
|
# > result = wilcox.test(rep(0.1, 10), exact=FALSE, correct=FALSE)
|
||||||
|
# > result$p.value
|
||||||
|
# [1] 0.001565402
|
||||||
|
# > result = wilcox.test(rep(0.1, 10), exact=FALSE, correct=TRUE)
|
||||||
|
# > result$p.value
|
||||||
|
# [1] 0.001904195
|
||||||
|
stat, p = stats.wilcoxon([0.1] * 10)
|
||||||
|
expected_p = 0.001565402
|
||||||
|
assert_equal(stat, 0)
|
||||||
|
assert_allclose(p, expected_p, rtol=1e-6)
|
||||||
|
|
||||||
|
stat, p = stats.wilcoxon([0.1] * 10, correction=True)
|
||||||
|
expected_p = 0.001904195
|
||||||
|
assert_equal(stat, 0)
|
||||||
|
assert_allclose(p, expected_p, rtol=1e-6)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_module_suite()
|
@ -0,0 +1,275 @@
|
|||||||
|
"""
|
||||||
|
Test functions for multivariate normal distributions.
|
||||||
|
|
||||||
|
"""
|
||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
from numpy.testing import (assert_almost_equal,
|
||||||
|
run_module_suite, assert_allclose, assert_equal, assert_raises)
|
||||||
|
|
||||||
|
import numpy
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
import scipy.linalg
|
||||||
|
import scipy.stats._multivariate
|
||||||
|
from scipy.stats import multivariate_normal
|
||||||
|
from scipy.stats import norm
|
||||||
|
|
||||||
|
from scipy.stats._multivariate import _psd_pinv_decomposed_log_pdet
|
||||||
|
|
||||||
|
from scipy.integrate import romb
|
||||||
|
|
||||||
|
|
||||||
|
def test_scalar_values():
|
||||||
|
np.random.seed(1234)
|
||||||
|
|
||||||
|
# When evaluated on scalar data, the pdf should return a scalar
|
||||||
|
x, mean, cov = 1.5, 1.7, 2.5
|
||||||
|
pdf = multivariate_normal.pdf(x, mean, cov)
|
||||||
|
assert_equal(pdf.ndim, 0)
|
||||||
|
|
||||||
|
# When evaluated on a single vector, the pdf should return a scalar
|
||||||
|
x = np.random.randn(5)
|
||||||
|
mean = np.random.randn(5)
|
||||||
|
cov = np.abs(np.random.randn(5)) # Diagonal values for cov. matrix
|
||||||
|
pdf = multivariate_normal.pdf(x, mean, cov)
|
||||||
|
assert_equal(pdf.ndim, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_logpdf():
|
||||||
|
# Check that the log of the pdf is in fact the logpdf
|
||||||
|
np.random.seed(1234)
|
||||||
|
x = np.random.randn(5)
|
||||||
|
mean = np.random.randn(5)
|
||||||
|
cov = np.abs(np.random.randn(5))
|
||||||
|
d1 = multivariate_normal.logpdf(x, mean, cov)
|
||||||
|
d2 = multivariate_normal.pdf(x, mean, cov)
|
||||||
|
assert_allclose(d1, np.log(d2))
|
||||||
|
|
||||||
|
|
||||||
|
def test_large_pseudo_determinant():
|
||||||
|
# Check that large pseudo-determinants are handled appropriately.
|
||||||
|
|
||||||
|
# Construct a singular diagonal covariance matrix
|
||||||
|
# whose pseudo determinant overflows double precision.
|
||||||
|
large_total_log = 1000.0
|
||||||
|
npos = 100
|
||||||
|
nzero = 2
|
||||||
|
large_entry = np.exp(large_total_log / npos)
|
||||||
|
n = npos + nzero
|
||||||
|
cov = np.zeros((n, n), dtype=float)
|
||||||
|
np.fill_diagonal(cov, large_entry)
|
||||||
|
cov[-nzero:, -nzero:] = 0
|
||||||
|
|
||||||
|
# Check some determinants.
|
||||||
|
assert_equal(scipy.linalg.det(cov), 0)
|
||||||
|
assert_equal(scipy.linalg.det(cov[:npos, :npos]), np.inf)
|
||||||
|
|
||||||
|
# np.linalg.slogdet is only available in numpy 1.6+
|
||||||
|
# but scipy currently supports numpy 1.5.1.
|
||||||
|
#assert_allclose(np.linalg.slogdet(cov[:npos, :npos]), (1, large_total_log))
|
||||||
|
|
||||||
|
# Check the pseudo-determinant.
|
||||||
|
U, log_pdet = scipy.stats._multivariate._psd_pinv_decomposed_log_pdet(cov)
|
||||||
|
assert_allclose(log_pdet, large_total_log)
|
||||||
|
|
||||||
|
|
||||||
|
def test_broadcasting():
|
||||||
|
np.random.seed(1234)
|
||||||
|
n = 4
|
||||||
|
|
||||||
|
# Construct a random covariance matrix.
|
||||||
|
data = np.random.randn(n, n)
|
||||||
|
cov = np.dot(data, data.T)
|
||||||
|
mean = np.random.randn(n)
|
||||||
|
|
||||||
|
# Construct an ndarray which can be interpreted as
|
||||||
|
# a 2x3 array whose elements are random data vectors.
|
||||||
|
X = np.random.randn(2, 3, n)
|
||||||
|
|
||||||
|
# Check that multiple data points can be evaluated at once.
|
||||||
|
for i in range(2):
|
||||||
|
for j in range(3):
|
||||||
|
actual = multivariate_normal.pdf(X[i, j], mean, cov)
|
||||||
|
desired = multivariate_normal.pdf(X, mean, cov)[i, j]
|
||||||
|
assert_allclose(actual, desired)
|
||||||
|
|
||||||
|
|
||||||
|
def test_normal_1D():
|
||||||
|
# The probability density function for a 1D normal variable should
|
||||||
|
# agree with the standard normal distribution in scipy.stats.distributions
|
||||||
|
x = np.linspace(0, 2, 10)
|
||||||
|
mean, cov = 1.2, 0.9
|
||||||
|
scale = cov**0.5
|
||||||
|
d1 = norm.pdf(x, mean, scale)
|
||||||
|
d2 = multivariate_normal.pdf(x, mean, cov)
|
||||||
|
assert_allclose(d1, d2)
|
||||||
|
|
||||||
|
|
||||||
|
def test_marginalization():
|
||||||
|
# Integrating out one of the variables of a 2D Gaussian should
|
||||||
|
# yield a 1D Gaussian
|
||||||
|
mean = np.array([2.5, 3.5])
|
||||||
|
cov = np.array([[.5, 0.2], [0.2, .6]])
|
||||||
|
n = 2**8 + 1 # Number of samples
|
||||||
|
delta = 6 / (n - 1) # Grid spacing
|
||||||
|
|
||||||
|
v = np.linspace(0, 6, n)
|
||||||
|
xv, yv = np.meshgrid(v, v)
|
||||||
|
pos = np.empty((n, n, 2))
|
||||||
|
pos[:, :, 0] = xv
|
||||||
|
pos[:, :, 1] = yv
|
||||||
|
pdf = multivariate_normal.pdf(pos, mean, cov)
|
||||||
|
|
||||||
|
# Marginalize over x and y axis
|
||||||
|
margin_x = romb(pdf, delta, axis=0)
|
||||||
|
margin_y = romb(pdf, delta, axis=1)
|
||||||
|
|
||||||
|
# Compare with standard normal distribution
|
||||||
|
gauss_x = norm.pdf(v, loc=mean[0], scale=cov[0, 0]**0.5)
|
||||||
|
gauss_y = norm.pdf(v, loc=mean[1], scale=cov[1, 1]**0.5)
|
||||||
|
assert_allclose(margin_x, gauss_x, rtol=1e-2, atol=1e-2)
|
||||||
|
assert_allclose(margin_y, gauss_y, rtol=1e-2, atol=1e-2)
|
||||||
|
|
||||||
|
|
||||||
|
def test_frozen():
|
||||||
|
# The frozen distribution should agree with the regular one
|
||||||
|
np.random.seed(1234)
|
||||||
|
x = np.random.randn(5)
|
||||||
|
mean = np.random.randn(5)
|
||||||
|
cov = np.abs(np.random.randn(5))
|
||||||
|
norm_frozen = multivariate_normal(mean, cov)
|
||||||
|
assert_allclose(norm_frozen.pdf(x), multivariate_normal.pdf(x, mean, cov))
|
||||||
|
assert_allclose(norm_frozen.logpdf(x),
|
||||||
|
multivariate_normal.logpdf(x, mean, cov))
|
||||||
|
|
||||||
|
|
||||||
|
def test_pseudodet_pinv():
|
||||||
|
# Make sure that pseudo-inverse and pseudo-det agree on cutoff
|
||||||
|
|
||||||
|
# Assemble random covariance matrix with large and small eigenvalues
|
||||||
|
np.random.seed(1234)
|
||||||
|
n = 7
|
||||||
|
x = np.random.randn(n, n)
|
||||||
|
cov = np.dot(x, x.T)
|
||||||
|
s, u = scipy.linalg.eigh(cov)
|
||||||
|
s = 0.5 * np.ones(n)
|
||||||
|
s[0] = 1.0
|
||||||
|
s[-1] = 1e-7
|
||||||
|
cov = np.dot(u, np.dot(np.diag(s), u.T))
|
||||||
|
|
||||||
|
# Set cond so that the lowest eigenvalue is below the cutoff
|
||||||
|
cond = 1e-5
|
||||||
|
U, log_pdet = _psd_pinv_decomposed_log_pdet(cov, cond)
|
||||||
|
pinv = np.dot(U, U.T)
|
||||||
|
_, log_pdet_pinv = _psd_pinv_decomposed_log_pdet(pinv, cond)
|
||||||
|
|
||||||
|
# Check that the log pseudo-determinant agrees with the sum
|
||||||
|
# of the logs of all but the smallest eigenvalue
|
||||||
|
assert_allclose(log_pdet, np.sum(np.log(s[:-1])))
|
||||||
|
# Check that the pseudo-determinant of the pseudo-inverse
|
||||||
|
# agrees with 1 / pseudo-determinant
|
||||||
|
assert_allclose(-log_pdet, log_pdet_pinv)
|
||||||
|
|
||||||
|
|
||||||
|
def test_exception_nonsquare_cov():
|
||||||
|
cov = [[1, 2, 3], [4, 5, 6]]
|
||||||
|
assert_raises(ValueError, _psd_pinv_decomposed_log_pdet, cov)
|
||||||
|
|
||||||
|
|
||||||
|
def test_exception_nonfinite_cov():
|
||||||
|
cov_nan = [[1, 0], [0, np.nan]]
|
||||||
|
assert_raises(ValueError, _psd_pinv_decomposed_log_pdet, cov_nan)
|
||||||
|
cov_inf = [[1, 0], [0, np.inf]]
|
||||||
|
assert_raises(ValueError, _psd_pinv_decomposed_log_pdet, cov_inf)
|
||||||
|
|
||||||
|
|
||||||
|
def test_exception_non_psd_cov():
|
||||||
|
cov = [[1, 0], [0, -1]]
|
||||||
|
assert_raises(ValueError, _psd_pinv_decomposed_log_pdet, cov)
|
||||||
|
|
||||||
|
|
||||||
|
def test_R_values():
|
||||||
|
# Compare the multivariate pdf with some values precomputed
|
||||||
|
# in R version 3.0.1 (2013-05-16) on Mac OS X 10.6.
|
||||||
|
|
||||||
|
# The values below were generated by the following R-script:
|
||||||
|
# > library(mnormt)
|
||||||
|
# > x <- seq(0, 2, length=5)
|
||||||
|
# > y <- 3*x - 2
|
||||||
|
# > z <- x + cos(y)
|
||||||
|
# > mu <- c(1, 3, 2)
|
||||||
|
# > Sigma <- matrix(c(1,2,0,2,5,0.5,0,0.5,3), 3, 3)
|
||||||
|
# > r_pdf <- dmnorm(cbind(x,y,z), mu, Sigma)
|
||||||
|
r_pdf = np.array([0.0002214706, 0.0013819953, 0.0049138692,
|
||||||
|
0.0103803050, 0.0140250800])
|
||||||
|
|
||||||
|
x = np.linspace(0, 2, 5)
|
||||||
|
y = 3 * x - 2
|
||||||
|
z = x + np.cos(y)
|
||||||
|
r = np.array([x, y, z]).T
|
||||||
|
|
||||||
|
mean = np.array([1, 3, 2], 'd')
|
||||||
|
cov = np.array([[1, 2, 0], [2, 5, .5], [0, .5, 3]], 'd')
|
||||||
|
|
||||||
|
pdf = multivariate_normal.pdf(r, mean, cov)
|
||||||
|
assert_allclose(pdf, r_pdf, atol=1e-10)
|
||||||
|
|
||||||
|
|
||||||
|
def test_rvs_shape():
|
||||||
|
# Check that rvs parses the mean and covariance correctly, and returns
|
||||||
|
# an array of the right shape
|
||||||
|
N = 300
|
||||||
|
d = 4
|
||||||
|
sample = multivariate_normal.rvs(mean=np.zeros(d), cov=1, size=N)
|
||||||
|
assert_equal(sample.shape, (N, d))
|
||||||
|
|
||||||
|
sample = multivariate_normal.rvs(mean=None,
|
||||||
|
cov=np.array([[2, .1], [.1, 1]]),
|
||||||
|
size=N)
|
||||||
|
assert_equal(sample.shape, (N, 2))
|
||||||
|
|
||||||
|
u = multivariate_normal(mean=0, cov=1)
|
||||||
|
sample = u.rvs(N)
|
||||||
|
assert_equal(sample.shape, (N, ))
|
||||||
|
|
||||||
|
|
||||||
|
def test_large_sample():
|
||||||
|
# Generate large sample and compare sample mean and sample covariance
|
||||||
|
# with mean and covariance matrix.
|
||||||
|
|
||||||
|
np.random.seed(2846)
|
||||||
|
|
||||||
|
n = 3
|
||||||
|
mean = np.random.randn(n)
|
||||||
|
M = np.random.randn(n, n)
|
||||||
|
cov = np.dot(M, M.T)
|
||||||
|
size = 5000
|
||||||
|
|
||||||
|
sample = multivariate_normal.rvs(mean, cov, size)
|
||||||
|
|
||||||
|
assert_allclose(numpy.cov(sample.T), cov, rtol=1e-1)
|
||||||
|
assert_allclose(sample.mean(0), mean, rtol=1e-1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_entropy():
|
||||||
|
np.random.seed(2846)
|
||||||
|
|
||||||
|
n = 3
|
||||||
|
mean = np.random.randn(n)
|
||||||
|
M = np.random.randn(n, n)
|
||||||
|
cov = np.dot(M, M.T)
|
||||||
|
|
||||||
|
rv = multivariate_normal(mean, cov)
|
||||||
|
|
||||||
|
# Check that frozen distribution agrees with entropy function
|
||||||
|
assert_almost_equal(rv.entropy(), multivariate_normal.entropy(mean, cov))
|
||||||
|
# Compare entropy with manually computed expression involving
|
||||||
|
# the sum of the logs of the eigenvalues of the covariance matrix
|
||||||
|
eigs = np.linalg.eig(cov)[0]
|
||||||
|
desired = 1/2 * (n * (np.log(2*np.pi) + 1) + np.sum(np.log(eigs)))
|
||||||
|
assert_almost_equal(desired, rv.entropy())
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_module_suite()
|
@ -0,0 +1,193 @@
|
|||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from numpy.testing import TestCase, run_module_suite, assert_equal, \
|
||||||
|
assert_array_equal
|
||||||
|
|
||||||
|
from scipy.stats import rankdata, tiecorrect
|
||||||
|
|
||||||
|
|
||||||
|
class TestTieCorrect(TestCase):
|
||||||
|
|
||||||
|
def test_empty(self):
|
||||||
|
"""An empty array requires no correction, should return 1.0."""
|
||||||
|
ranks = np.array([], dtype=np.float64)
|
||||||
|
c = tiecorrect(ranks)
|
||||||
|
assert_equal(c, 1.0)
|
||||||
|
|
||||||
|
def test_one(self):
|
||||||
|
"""A single element requires no correction, should return 1.0."""
|
||||||
|
ranks = np.array([1.0], dtype=np.float64)
|
||||||
|
c = tiecorrect(ranks)
|
||||||
|
assert_equal(c, 1.0)
|
||||||
|
|
||||||
|
def test_no_correction(self):
|
||||||
|
"""Arrays with no ties require no correction."""
|
||||||
|
ranks = np.arange(2.0)
|
||||||
|
c = tiecorrect(ranks)
|
||||||
|
assert_equal(c, 1.0)
|
||||||
|
ranks = np.arange(3.0)
|
||||||
|
c = tiecorrect(ranks)
|
||||||
|
assert_equal(c, 1.0)
|
||||||
|
|
||||||
|
def test_basic(self):
|
||||||
|
"""Check a few basic examples of the tie correction factor."""
|
||||||
|
# One tie of two elements
|
||||||
|
ranks = np.array([1.0, 2.5, 2.5])
|
||||||
|
c = tiecorrect(ranks)
|
||||||
|
T = 2.0
|
||||||
|
N = ranks.size
|
||||||
|
expected = 1.0 - (T**3 - T) / (N**3 - N)
|
||||||
|
assert_equal(c, expected)
|
||||||
|
|
||||||
|
# One tie of two elements (same as above, but tie is not at the end)
|
||||||
|
ranks = np.array([1.5, 1.5, 3.0])
|
||||||
|
c = tiecorrect(ranks)
|
||||||
|
T = 2.0
|
||||||
|
N = ranks.size
|
||||||
|
expected = 1.0 - (T**3 - T) / (N**3 - N)
|
||||||
|
assert_equal(c, expected)
|
||||||
|
|
||||||
|
# One tie of three elements
|
||||||
|
ranks = np.array([1.0, 3.0, 3.0, 3.0])
|
||||||
|
c = tiecorrect(ranks)
|
||||||
|
T = 3.0
|
||||||
|
N = ranks.size
|
||||||
|
expected = 1.0 - (T**3 - T) / (N**3 - N)
|
||||||
|
assert_equal(c, expected)
|
||||||
|
|
||||||
|
# Two ties, lengths 2 and 3.
|
||||||
|
ranks = np.array([1.5, 1.5, 4.0, 4.0, 4.0])
|
||||||
|
c = tiecorrect(ranks)
|
||||||
|
T1 = 2.0
|
||||||
|
T2 = 3.0
|
||||||
|
N = ranks.size
|
||||||
|
expected = 1.0 - ((T1**3 - T1) + (T2**3 - T2)) / (N**3 - N)
|
||||||
|
assert_equal(c, expected)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRankData(TestCase):
|
||||||
|
|
||||||
|
def test_empty(self):
|
||||||
|
"""stats.rankdata([]) should return an empty array."""
|
||||||
|
a = np.array([], dtype=np.int)
|
||||||
|
r = rankdata(a)
|
||||||
|
assert_array_equal(r, np.array([], dtype=np.float64))
|
||||||
|
r = rankdata([])
|
||||||
|
assert_array_equal(r, np.array([], dtype=np.float64))
|
||||||
|
|
||||||
|
def test_one(self):
|
||||||
|
"""Check stats.rankdata with an array of length 1."""
|
||||||
|
data = [100]
|
||||||
|
a = np.array(data, dtype=np.int)
|
||||||
|
r = rankdata(a)
|
||||||
|
assert_array_equal(r, np.array([1.0], dtype=np.float64))
|
||||||
|
r = rankdata(data)
|
||||||
|
assert_array_equal(r, np.array([1.0], dtype=np.float64))
|
||||||
|
|
||||||
|
def test_basic(self):
|
||||||
|
"""Basic tests of stats.rankdata."""
|
||||||
|
data = [100, 10, 50]
|
||||||
|
expected = np.array([3.0, 1.0, 2.0], dtype=np.float64)
|
||||||
|
a = np.array(data, dtype=np.int)
|
||||||
|
r = rankdata(a)
|
||||||
|
assert_array_equal(r, expected)
|
||||||
|
r = rankdata(data)
|
||||||
|
assert_array_equal(r, expected)
|
||||||
|
|
||||||
|
data = [40, 10, 30, 10, 50]
|
||||||
|
expected = np.array([4.0, 1.5, 3.0, 1.5, 5.0], dtype=np.float64)
|
||||||
|
a = np.array(data, dtype=np.int)
|
||||||
|
r = rankdata(a)
|
||||||
|
assert_array_equal(r, expected)
|
||||||
|
r = rankdata(data)
|
||||||
|
assert_array_equal(r, expected)
|
||||||
|
|
||||||
|
data = [20, 20, 20, 10, 10, 10]
|
||||||
|
expected = np.array([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=np.float64)
|
||||||
|
a = np.array(data, dtype=np.int)
|
||||||
|
r = rankdata(a)
|
||||||
|
assert_array_equal(r, expected)
|
||||||
|
r = rankdata(data)
|
||||||
|
assert_array_equal(r, expected)
|
||||||
|
# The docstring states explicitly that the argument is flattened.
|
||||||
|
a2d = a.reshape(2, 3)
|
||||||
|
r = rankdata(a2d)
|
||||||
|
assert_array_equal(r, expected)
|
||||||
|
|
||||||
|
def test_large_int(self):
|
||||||
|
data = np.array([2**60, 2**60+1], dtype=np.uint64)
|
||||||
|
r = rankdata(data)
|
||||||
|
assert_array_equal(r, [1.0, 2.0])
|
||||||
|
|
||||||
|
data = np.array([2**60, 2**60+1], dtype=np.int64)
|
||||||
|
r = rankdata(data)
|
||||||
|
assert_array_equal(r, [1.0, 2.0])
|
||||||
|
|
||||||
|
data = np.array([2**60, -2**60+1], dtype=np.int64)
|
||||||
|
r = rankdata(data)
|
||||||
|
assert_array_equal(r, [2.0, 1.0])
|
||||||
|
|
||||||
|
def test_big_tie(self):
|
||||||
|
for n in [10000, 100000, 1000000]:
|
||||||
|
data = np.ones(n, dtype=int)
|
||||||
|
r = rankdata(data)
|
||||||
|
expected_rank = 0.5 * (n + 1)
|
||||||
|
assert_array_equal(r, expected_rank * data,
|
||||||
|
"test failed with n=%d" % n)
|
||||||
|
|
||||||
|
|
||||||
|
_cases = (
|
||||||
|
# values, method, expected
|
||||||
|
([], 'average', []),
|
||||||
|
([], 'min', []),
|
||||||
|
([], 'max', []),
|
||||||
|
([], 'dense', []),
|
||||||
|
([], 'ordinal', []),
|
||||||
|
#
|
||||||
|
([100], 'average', [1.0]),
|
||||||
|
([100], 'min', [1.0]),
|
||||||
|
([100], 'max', [1.0]),
|
||||||
|
([100], 'dense', [1.0]),
|
||||||
|
([100], 'ordinal', [1.0]),
|
||||||
|
#
|
||||||
|
([100, 100, 100], 'average', [2.0, 2.0, 2.0]),
|
||||||
|
([100, 100, 100], 'min', [1.0, 1.0, 1.0]),
|
||||||
|
([100, 100, 100], 'max', [3.0, 3.0, 3.0]),
|
||||||
|
([100, 100, 100], 'dense', [1.0, 1.0, 1.0]),
|
||||||
|
([100, 100, 100], 'ordinal', [1.0, 2.0, 3.0]),
|
||||||
|
#
|
||||||
|
([100, 300, 200], 'average', [1.0, 3.0, 2.0]),
|
||||||
|
([100, 300, 200], 'min', [1.0, 3.0, 2.0]),
|
||||||
|
([100, 300, 200], 'max', [1.0, 3.0, 2.0]),
|
||||||
|
([100, 300, 200], 'dense', [1.0, 3.0, 2.0]),
|
||||||
|
([100, 300, 200], 'ordinal', [1.0, 3.0, 2.0]),
|
||||||
|
#
|
||||||
|
([100, 200, 300, 200], 'average', [1.0, 2.5, 4.0, 2.5]),
|
||||||
|
([100, 200, 300, 200], 'min', [1.0, 2.0, 4.0, 2.0]),
|
||||||
|
([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]),
|
||||||
|
([100, 200, 300, 200], 'dense', [1.0, 2.0, 3.0, 2.0]),
|
||||||
|
([100, 200, 300, 200], 'ordinal', [1.0, 2.0, 4.0, 3.0]),
|
||||||
|
#
|
||||||
|
([100, 200, 300, 200, 100], 'average', [1.5, 3.5, 5.0, 3.5, 1.5]),
|
||||||
|
([100, 200, 300, 200, 100], 'min', [1.0, 3.0, 5.0, 3.0, 1.0]),
|
||||||
|
([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 2.0]),
|
||||||
|
([100, 200, 300, 200, 100], 'dense', [1.0, 2.0, 3.0, 2.0, 1.0]),
|
||||||
|
([100, 200, 300, 200, 100], 'ordinal', [1.0, 3.0, 5.0, 4.0, 2.0]),
|
||||||
|
#
|
||||||
|
([10] * 30, 'ordinal', np.arange(1.0, 31.0)),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_cases():
|
||||||
|
|
||||||
|
def check_case(values, method, expected):
|
||||||
|
r = rankdata(values, method=method)
|
||||||
|
assert_array_equal(r, expected)
|
||||||
|
|
||||||
|
for values, method, expected in _cases:
|
||||||
|
yield check_case, values, method, expected
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_module_suite()
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,91 @@
|
|||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from numpy.testing import assert_allclose, assert_equal, run_module_suite
|
||||||
|
|
||||||
|
from scipy.stats._tukeylambda_stats import tukeylambda_variance, \
|
||||||
|
tukeylambda_kurtosis
|
||||||
|
|
||||||
|
|
||||||
|
def test_tukeylambda_stats_known_exact():
|
||||||
|
"""Compare results with some known exact formulas."""
|
||||||
|
# Some exact values of the Tukey Lambda variance and kurtosis:
|
||||||
|
# lambda var kurtosis
|
||||||
|
# 0 pi**2/3 6/5 (logistic distribution)
|
||||||
|
# 0.5 4 - pi (5/3 - pi/2)/(pi/4 - 1)**2 - 3
|
||||||
|
# 1 1/3 -6/5 (uniform distribution on (-1,1))
|
||||||
|
# 2 1/12 -6/5 (uniform distribution on (-1/2, 1/2))
|
||||||
|
|
||||||
|
# lambda = 0
|
||||||
|
var = tukeylambda_variance(0)
|
||||||
|
assert_allclose(var, np.pi**2 / 3, atol=1e-12)
|
||||||
|
kurt = tukeylambda_kurtosis(0)
|
||||||
|
assert_allclose(kurt, 1.2, atol=1e-10)
|
||||||
|
|
||||||
|
# lambda = 0.5
|
||||||
|
var = tukeylambda_variance(0.5)
|
||||||
|
assert_allclose(var, 4 - np.pi, atol=1e-12)
|
||||||
|
kurt = tukeylambda_kurtosis(0.5)
|
||||||
|
desired = (5./3 - np.pi/2) / (np.pi/4 - 1)**2 - 3
|
||||||
|
assert_allclose(kurt, desired, atol=1e-10)
|
||||||
|
|
||||||
|
# lambda = 1
|
||||||
|
var = tukeylambda_variance(1)
|
||||||
|
assert_allclose(var, 1.0 / 3, atol=1e-12)
|
||||||
|
kurt = tukeylambda_kurtosis(1)
|
||||||
|
assert_allclose(kurt, -1.2, atol=1e-10)
|
||||||
|
|
||||||
|
# lambda = 2
|
||||||
|
var = tukeylambda_variance(2)
|
||||||
|
assert_allclose(var, 1.0 / 12, atol=1e-12)
|
||||||
|
kurt = tukeylambda_kurtosis(2)
|
||||||
|
assert_allclose(kurt, -1.2, atol=1e-10)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tukeylambda_stats_mpmath():
|
||||||
|
"""Compare results with some values that were computed using mpmath."""
|
||||||
|
a10 = dict(atol=1e-10, rtol=0)
|
||||||
|
a12 = dict(atol=1e-12, rtol=0)
|
||||||
|
data = [
|
||||||
|
# lambda variance kurtosis
|
||||||
|
[-0.1, 4.78050217874253547, 3.78559520346454510],
|
||||||
|
[-0.0649, 4.16428023599895777, 2.52019675947435718],
|
||||||
|
[-0.05, 3.93672267890775277, 2.13129793057777277],
|
||||||
|
[-0.001, 3.30128380390964882, 1.21452460083542988],
|
||||||
|
[0.001, 3.27850775649572176, 1.18560634779287585],
|
||||||
|
[0.03125, 2.95927803254615800, 0.804487555161819980],
|
||||||
|
[0.05, 2.78281053405464501, 0.611604043886644327],
|
||||||
|
[0.0649, 2.65282386754100551, 0.476834119532774540],
|
||||||
|
[1.2, 0.242153920578588346, -1.23428047169049726],
|
||||||
|
[10.0, 0.00095237579757703597, 2.37810697355144933],
|
||||||
|
[20.0, 0.00012195121951131043, 7.37654321002709531],
|
||||||
|
]
|
||||||
|
|
||||||
|
for lam, var_expected, kurt_expected in data:
|
||||||
|
var = tukeylambda_variance(lam)
|
||||||
|
assert_allclose(var, var_expected, **a12)
|
||||||
|
kurt = tukeylambda_kurtosis(lam)
|
||||||
|
assert_allclose(kurt, kurt_expected, **a10)
|
||||||
|
|
||||||
|
# Test with vector arguments (most of the other tests are for single
|
||||||
|
# values).
|
||||||
|
lam, var_expected, kurt_expected = zip(*data)
|
||||||
|
var = tukeylambda_variance(lam)
|
||||||
|
assert_allclose(var, var_expected, **a12)
|
||||||
|
kurt = tukeylambda_kurtosis(lam)
|
||||||
|
assert_allclose(kurt, kurt_expected, **a10)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tukeylambda_stats_invalid():
|
||||||
|
"""Test values of lambda outside the domains of the functions."""
|
||||||
|
lam = [-1.0, -0.5]
|
||||||
|
var = tukeylambda_variance(lam)
|
||||||
|
assert_equal(var, np.array([np.nan, np.inf]))
|
||||||
|
|
||||||
|
lam = [-1.0, -0.25]
|
||||||
|
kurt = tukeylambda_kurtosis(lam)
|
||||||
|
assert_equal(kurt, np.array([np.nan, np.inf]))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_module_suite()
|
@ -0,0 +1,47 @@
|
|||||||
|
from __future__ import division, print_function, absolute_import
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import scipy.stats
|
||||||
|
from scipy.special import i0
|
||||||
|
|
||||||
|
|
||||||
|
def von_mises_cdf_series(k,x,p):
|
||||||
|
x = float(x)
|
||||||
|
s = np.sin(x)
|
||||||
|
c = np.cos(x)
|
||||||
|
sn = np.sin(p*x)
|
||||||
|
cn = np.cos(p*x)
|
||||||
|
R = 0
|
||||||
|
V = 0
|
||||||
|
for n in range(p-1,0,-1):
|
||||||
|
sn, cn = sn*c - cn*s, cn*c + sn*s
|
||||||
|
R = 1./(2*n/k + R)
|
||||||
|
V = R*(sn/n+V)
|
||||||
|
|
||||||
|
return 0.5+x/(2*np.pi) + V/np.pi
|
||||||
|
|
||||||
|
|
||||||
|
def von_mises_cdf_normalapprox(k,x,C1):
|
||||||
|
b = np.sqrt(2/np.pi)*np.exp(k)/i0(k)
|
||||||
|
z = b*np.sin(x/2.)
|
||||||
|
return scipy.stats.norm.cdf(z)
|
||||||
|
|
||||||
|
|
||||||
|
def von_mises_cdf(k,x):
|
||||||
|
ix = 2*np.pi*np.round(x/(2*np.pi))
|
||||||
|
x = x-ix
|
||||||
|
k = float(k)
|
||||||
|
|
||||||
|
# These values should give 12 decimal digits
|
||||||
|
CK = 50
|
||||||
|
a = [28., 0.5, 100., 5.0]
|
||||||
|
C1 = 50.1
|
||||||
|
|
||||||
|
if k < CK:
|
||||||
|
p = int(np.ceil(a[0]+a[1]*k-a[2]/(k+a[3])))
|
||||||
|
|
||||||
|
F = np.clip(von_mises_cdf_series(k,x,p),0,1)
|
||||||
|
else:
|
||||||
|
F = von_mises_cdf_normalapprox(k,x,C1)
|
||||||
|
|
||||||
|
return F+ix
|
@ -0,0 +1,76 @@
|
|||||||
|
import numpy as np
|
||||||
|
import scipy.stats
|
||||||
|
from scipy.special import i0
|
||||||
|
import numpy.testing
|
||||||
|
cimport numpy as np
|
||||||
|
|
||||||
|
cdef extern from "math.h":
|
||||||
|
double cos(double theta)
|
||||||
|
double sin(double theta)
|
||||||
|
|
||||||
|
|
||||||
|
cdef double von_mises_cdf_series(double k,double x,unsigned int p):
|
||||||
|
cdef double s, c, sn, cn, R, V
|
||||||
|
cdef unsigned int n
|
||||||
|
s = sin(x)
|
||||||
|
c = cos(x)
|
||||||
|
sn = sin(p*x)
|
||||||
|
cn = cos(p*x)
|
||||||
|
R = 0
|
||||||
|
V = 0
|
||||||
|
for n in range(p-1,0,-1):
|
||||||
|
sn, cn = sn*c - cn*s, cn*c + sn*s
|
||||||
|
R = 1./(2*n/k + R)
|
||||||
|
V = R*(sn/n+V)
|
||||||
|
|
||||||
|
return 0.5+x/(2*np.pi) + V/np.pi
|
||||||
|
|
||||||
|
def von_mises_cdf_normalapprox(k,x,C1):
|
||||||
|
b = np.sqrt(2/np.pi)*np.exp(k)/i0(k)
|
||||||
|
z = b*np.sin(x/2.)
|
||||||
|
C = 24*k
|
||||||
|
chi = z - z**3/((C-2*z**2-16)/3.-(z**4+7/4.*z**2+167./2)/(C+C1-z**2+3))**2
|
||||||
|
return scipy.stats.norm.cdf(z)
|
||||||
|
|
||||||
|
cimport cython
|
||||||
|
@cython.boundscheck(False)
|
||||||
|
def von_mises_cdf(k,x):
|
||||||
|
cdef np.ndarray[double, ndim=1] temp, temp_xs, temp_ks
|
||||||
|
cdef unsigned int i, p
|
||||||
|
cdef double a1, a2, a3, a4, C1, CK
|
||||||
|
#k,x = np.broadcast_arrays(np.asarray(k),np.asarray(x))
|
||||||
|
k = np.asarray(k)
|
||||||
|
x = np.asarray(x)
|
||||||
|
zerodim = k.ndim==0 and x.ndim==0
|
||||||
|
|
||||||
|
k = np.atleast_1d(k)
|
||||||
|
x = np.atleast_1d(x)
|
||||||
|
ix = np.round(x/(2*np.pi))
|
||||||
|
x = x-ix*2*np.pi
|
||||||
|
|
||||||
|
# These values should give 12 decimal digits
|
||||||
|
CK=50
|
||||||
|
a1, a2, a3, a4 = [28., 0.5, 100., 5.0]
|
||||||
|
C1 = 50.1
|
||||||
|
|
||||||
|
bx, bk = np.broadcast_arrays(x,k)
|
||||||
|
result = np.empty(bx.shape,dtype=np.float)
|
||||||
|
|
||||||
|
c_small_k = bk<CK
|
||||||
|
temp = result[c_small_k]
|
||||||
|
temp_xs = bx[c_small_k].astype(np.float)
|
||||||
|
temp_ks = bk[c_small_k].astype(np.float)
|
||||||
|
for i in range(len(temp)):
|
||||||
|
p = <int>(1+a1+a2*temp_ks[i]-a3/(temp_ks[i]+a4))
|
||||||
|
temp[i] = von_mises_cdf_series(temp_ks[i],temp_xs[i],p)
|
||||||
|
if temp[i]<0:
|
||||||
|
temp[i]=0
|
||||||
|
elif temp[i]>1:
|
||||||
|
temp[i]=1
|
||||||
|
result[c_small_k] = temp
|
||||||
|
result[~c_small_k] = von_mises_cdf_normalapprox(bk[~c_small_k],bx[~c_small_k],C1)
|
||||||
|
|
||||||
|
if not zerodim:
|
||||||
|
return result+ix
|
||||||
|
else:
|
||||||
|
return (result+ix)[0]
|
Loading…
Reference in New Issue