Simplified wafo.stats:

- Deleted obsolete files.
- Requires scipy v0.16.
- ._distn_infrastructure.py monkeypatches scipy.stats._distn_infrastructure.py.
master
pbrod 9 years ago
parent 7403d821df
commit e73624161c
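The third point of the commit message describes a monkeypatch of scipy's distribution machinery. A minimal sketch of what such a patch typically looks like, assuming the local module replaces selected attributes at import time; the patched attribute (rv_continuous.fit) is only an illustration, not taken from this commit:

import scipy.stats._distn_infrastructure as _infra

_original_fit = _infra.rv_continuous.fit

def _patched_fit(self, data, *args, **kwds):
    # Customised behaviour would go here; fall back to scipy's implementation.
    return _original_fit(self, data, *args, **kwds)

# Replace the method on scipy's class so every distribution picks it up.
_infra.rv_continuous.fit = _patched_fit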

@ -8,50 +8,14 @@ Statistical functions (:mod:`scipy.stats`)
This module contains a large number of probability distributions as
well as a growing library of statistical functions.
Each included distribution is an instance of the class rv_continuous:
For each given name the following methods are available:
Each univariate distribution is an instance of a subclass of `rv_continuous`
(`rv_discrete` for discrete distributions):
.. autosummary::
:toctree: generated/
rv_continuous
rv_continuous.pdf
rv_continuous.logpdf
rv_continuous.cdf
rv_continuous.logcdf
rv_continuous.sf
rv_continuous.logsf
rv_continuous.ppf
rv_continuous.isf
rv_continuous.moment
rv_continuous.stats
rv_continuous.entropy
rv_continuous.fit
rv_continuous.expect
Calling the instance as a function returns a frozen pdf whose shape,
location, and scale parameters are fixed.
Similarly, each discrete distribution is an instance of the class
rv_discrete:
.. autosummary::
:toctree: generated/
rv_discrete
rv_discrete.rvs
rv_discrete.pmf
rv_discrete.logpmf
rv_discrete.cdf
rv_discrete.logcdf
rv_discrete.sf
rv_discrete.logsf
rv_discrete.ppf
rv_discrete.isf
rv_discrete.stats
rv_discrete.moment
rv_discrete.entropy
rv_discrete.expect
Continuous distributions
========================
@ -65,7 +29,8 @@ Continuous distributions
beta -- Beta
betaprime -- Beta Prime
bradford -- Bradford
burr -- Burr
burr -- Burr (Type III)
burr12 -- Burr (Type XII)
cauchy -- Cauchy
chi -- Chi
chi2 -- Chi-squared
@ -74,6 +39,7 @@ Continuous distributions
dweibull -- Double Weibull
erlang -- Erlang
expon -- Exponential
exponnorm -- Exponentially Modified Normal
exponweib -- Exponentiated Weibull
exponpow -- Exponential Power
f -- F (Snedecor F)
@ -84,6 +50,7 @@ Continuous distributions
frechet_r -- Frechet Right Sided, Extreme Value Type II (Extreme LB) or weibull_min
frechet_l -- Frechet Left Sided, Weibull_max
genlogistic -- Generalized Logistic
gennorm -- Generalized normal
genpareto -- Generalized Pareto
genexpon -- Generalized Exponential
genextreme -- Generalized Extreme Value
@ -98,6 +65,7 @@ Continuous distributions
halfcauchy -- Half Cauchy
halflogistic -- Half Logistic
halfnorm -- Half Normal
halfgennorm -- Generalized Half Normal
hypsecant -- Hyperbolic Secant
invgamma -- Inverse Gamma
invgauss -- Inverse Gaussian
@ -107,6 +75,9 @@ Continuous distributions
ksone -- Kolmogorov-Smirnov one-sided (no stats)
kstwobign -- Kolmogorov-Smirnov two-sided test for Large N (no stats)
laplace -- Laplace
levy -- Levy
levy_l -- Left-skewed Levy
levy_stable -- Levy Stable
logistic -- Logistic
loggamma -- Log-Gamma
loglaplace -- Log-Laplace (Log Double Exponential)
@ -130,6 +101,7 @@ Continuous distributions
rice -- Rice
recipinvgauss -- Reciprocal Inverse Gaussian
semicircular -- Semicircular
skewnorm -- Skew normal
t -- Student's T
triang -- Triangular
truncexpon -- Truncated Exponential
@ -137,6 +109,7 @@ Continuous distributions
tukeylambda -- Tukey-Lambda
uniform -- Uniform
vonmises -- Von-Mises (Circular)
vonmises_line -- Von-Mises (Line)
wald -- Wald
weibull_min -- Minimum Weibull (see Frechet)
weibull_max -- Maximum Weibull (see Frechet)
@ -149,7 +122,12 @@ Multivariate distributions
:toctree: generated/
multivariate_normal -- Multivariate normal distribution
matrix_normal -- Matrix normal distribution
dirichlet -- Dirichlet
wishart -- Wishart
invwishart -- Inverse Wishart
special_ortho_group -- SO(N) group
ortho_group -- O(N) group
Discrete distributions
======================
@ -190,27 +168,28 @@ which work for masked arrays.
normaltest --
skew -- Skewness
skewtest --
kstat --
kstatvar --
tmean -- Truncated arithmetic mean
tvar -- Truncated variance
tmin --
tmax --
tstd --
tsem --
nanmean -- Mean, ignoring NaN values
nanstd -- Standard deviation, ignoring NaN values
nanmedian -- Median, ignoring NaN values
variation -- Coefficient of variation
find_repeats
trim_mean
.. autosummary::
:toctree: generated/
cumfreq _
histogram2 _
histogram _
itemfreq _
percentileofscore _
scoreatpercentile _
relfreq _
cumfreq
histogram2
histogram
itemfreq
percentileofscore
scoreatpercentile
relfreq
.. autosummary::
:toctree: generated/
@ -225,6 +204,7 @@ which work for masked arrays.
obrientransform
signaltonoise
bayes_mvs
mvsdist
sem
zmap
zscore
@ -247,12 +227,14 @@ which work for masked arrays.
kendalltau
linregress
theilslopes
f_value
.. autosummary::
:toctree: generated/
ttest_1samp
ttest_ind
ttest_ind_from_stats
ttest_rel
kstest
chisquare
@ -265,6 +247,10 @@ which work for masked arrays.
wilcoxon
kruskal
friedmanchisquare
combine_pvalues
ss
square_of_sums
jarque_bera
.. autosummary::
:toctree: generated/
@ -289,6 +275,22 @@ which work for masked arrays.
entropy
.. autosummary::
:toctree: generated/
chisqprob
betai
Circular statistical functions
==============================
.. autosummary::
:toctree: generated/
circmean
circvar
circstd
Contingency table functions
===========================
@ -335,21 +337,11 @@ interface package rpy.
from __future__ import division, print_function, absolute_import
from scipy.stats import *
from .core import *
from .stats import *
from .distributions import *
from .rv import *
from .morestats import *
from ._binned_statistic import *
from .kde import gaussian_kde
from . import mstats
from .contingency import chi2_contingency
from ._multivariate import *
from . import estimation
#remove vonmises_cython from __all__, I don't know why it is included
__all__ = [s for s in dir() if not (s.startswith('_') or s.endswith('cython'))]
#import distributions #@Reimport
#from wafo.stats.distributions import *
from numpy.testing import Tester
test = Tester().test
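With the star imports above, wafo.stats re-exports the scipy.stats distributions next to the package's own estimation helpers. A small usage sketch, assuming the package and scipy >= 0.16 are installed:

import wafo.stats as ws

# Distributions behave as in scipy.stats; calling one returns a
# "frozen" distribution with shape, location and scale fixed.
frozen = ws.norm(loc=2.0, scale=0.5)
print(frozen.mean(), frozen.std())     # 2.0 0.5
print(ws.expon.pdf(1.0))               # exp(-1), about 0.3679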

@ -1,408 +0,0 @@
from __future__ import division, print_function, absolute_import
import warnings
import numpy as np
from scipy._lib.six import callable
def binned_statistic(x, values, statistic='mean',
bins=10, range=None):
"""
Compute a binned statistic for a set of data.
This is a generalization of a histogram function. A histogram divides
the space into bins, and returns the count of the number of points in
each bin. This function allows the computation of the sum, mean, median,
or other statistic of the values within each bin.
Parameters
----------
x : array_like
A sequence of values to be binned.
values : array_like
The values on which the statistic will be computed. This must be
the same shape as `x`.
statistic : string or callable, optional
The statistic to compute (default is 'mean').
The following statistics are available:
* 'mean' : compute the mean of values for points within each bin.
Empty bins will be represented by NaN.
* 'median' : compute the median of values for points within each
bin. Empty bins will be represented by NaN.
* 'count' : compute the count of points within each bin. This is
identical to an unweighted histogram. `values` array is not
referenced.
* 'sum' : compute the sum of values for points within each bin.
This is identical to a weighted histogram.
* function : a user-defined function which takes a 1D array of
values, and outputs a single numerical statistic. This function
will be called on the values in each bin. Empty bins will be
represented by function([]), or NaN if this returns an error.
bins : int or sequence of scalars, optional
If `bins` is an int, it defines the number of equal-width
bins in the given range (10, by default). If `bins` is a sequence,
it defines the bin edges, including the rightmost edge, allowing
for non-uniform bin widths.
range : (float, float) or [(float, float)], optional
The lower and upper range of the bins. If not provided, range
is simply ``(x.min(), x.max())``. Values outside the range are
ignored.
Returns
-------
statistic : array
The values of the selected statistic in each bin.
bin_edges : array of dtype float
Return the bin edges ``(length(statistic)+1)``.
binnumber : 1-D ndarray of ints
This assigns to each observation an integer that represents the bin
in which this observation falls. Array has the same length as values.
See Also
--------
numpy.histogram, binned_statistic_2d, binned_statistic_dd
Notes
-----
All but the last (righthand-most) bin is half-open. In other words, if
`bins` is::
[1, 2, 3, 4]
then the first bin is ``[1, 2)`` (including 1, but excluding 2) and the
second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which *includes*
4.
.. versionadded:: 0.11.0
Examples
--------
>>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
... bins=3)
(array([ 1., 2., 4.]), array([ 1., 2., 3., 4.]), array([1, 2, 1, 2, 3]))
>>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean', bins=3)
(array([ 1., 2., 4.]), array([ 1., 2., 3., 4.]), array([1, 2, 1, 2, 3]))
"""
try:
N = len(bins)
except TypeError:
N = 1
if N != 1:
bins = [np.asarray(bins, float)]
if range is not None:
if len(range) == 2:
range = [range]
medians, edges, xy = binned_statistic_dd([x], values, statistic,
bins, range)
return medians, edges[0], xy
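As the docstring says, statistic='count' reduces to an unweighted histogram; a quick check of that claim against numpy.histogram, using scipy.stats.binned_statistic, whose interface this deleted module mirrored:

import numpy as np
from scipy.stats import binned_statistic

x = np.array([1.0, 2.0, 1.0, 2.0, 4.0])
counts, edges, binnumber = binned_statistic(x, x, statistic='count', bins=3)
hist, hist_edges = np.histogram(x, bins=3)
assert np.allclose(counts, hist) and np.allclose(edges, hist_edges)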
def binned_statistic_2d(x, y, values, statistic='mean',
bins=10, range=None):
"""
Compute a bidimensional binned statistic for a set of data.
This is a generalization of a histogram2d function. A histogram divides
the space into bins, and returns the count of the number of points in
each bin. This function allows the computation of the sum, mean, median,
or other statistic of the values within each bin.
Parameters
----------
x : (N,) array_like
A sequence of values to be binned along the first dimension.
y : (M,) array_like
A sequence of values to be binned along the second dimension.
values : (N,) array_like
The values on which the statistic will be computed. This must be
the same shape as `x`.
statistic : string or callable, optional
The statistic to compute (default is 'mean').
The following statistics are available:
* 'mean' : compute the mean of values for points within each bin.
Empty bins will be represented by NaN.
* 'median' : compute the median of values for points within each
bin. Empty bins will be represented by NaN.
* 'count' : compute the count of points within each bin. This is
identical to an unweighted histogram. `values` array is not
referenced.
* 'sum' : compute the sum of values for points within each bin.
This is identical to a weighted histogram.
* function : a user-defined function which takes a 1D array of
values, and outputs a single numerical statistic. This function
will be called on the values in each bin. Empty bins will be
represented by function([]), or NaN if this returns an error.
bins : int or [int, int] or array-like or [array, array], optional
The bin specification:
* the number of bins for the two dimensions (nx=ny=bins),
* the number of bins in each dimension (nx, ny = bins),
* the bin edges for the two dimensions (x_edges = y_edges = bins),
* the bin edges in each dimension (x_edges, y_edges = bins).
range : (2,2) array_like, optional
The leftmost and rightmost edges of the bins along each dimension
(if not specified explicitly in the `bins` parameters):
[[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
considered outliers and not tallied in the histogram.
Returns
-------
statistic : (nx, ny) ndarray
The values of the selected statistic in each two-dimensional bin
xedges : (nx + 1) ndarray
The bin edges along the first dimension.
yedges : (ny + 1) ndarray
The bin edges along the second dimension.
binnumber : 1-D ndarray of ints
This assigns to each observation an integer that represents the bin
in which this observation falls. Array has the same length as `values`.
See Also
--------
numpy.histogram2d, binned_statistic, binned_statistic_dd
Notes
-----
.. versionadded:: 0.11.0
"""
# This code is based on np.histogram2d
try:
N = len(bins)
except TypeError:
N = 1
if N != 1 and N != 2:
xedges = yedges = np.asarray(bins, float)
bins = [xedges, yedges]
medians, edges, xy = binned_statistic_dd([x, y], values, statistic,
bins, range)
return medians, edges[0], edges[1], xy
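A short usage sketch of the 2-D variant (via scipy.stats, which provides the same interface); the values are arbitrary:

import numpy as np
from scipy.stats import binned_statistic_2d

x = np.array([0.1, 0.1, 0.9, 0.9])
y = np.array([0.2, 0.8, 0.2, 0.8])
values = np.array([1.0, 2.0, 3.0, 4.0])
stat, xedges, yedges, binnumber = binned_statistic_2d(
    x, y, values, statistic='sum', bins=2)
# stat is the (2, 2) array of per-bin sums: [[1., 2.], [3., 4.]]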
def binned_statistic_dd(sample, values, statistic='mean',
bins=10, range=None):
"""
Compute a multidimensional binned statistic for a set of data.
This is a generalization of a histogramdd function. A histogram divides
the space into bins, and returns the count of the number of points in
each bin. This function allows the computation of the sum, mean, median,
or other statistic of the values within each bin.
Parameters
----------
sample : array_like
Data to histogram passed as a sequence of D arrays of length N, or
as an (N,D) array.
values : array_like
The values on which the statistic will be computed. This must be
the same shape as x.
statistic : string or callable, optional
The statistic to compute (default is 'mean').
The following statistics are available:
* 'mean' : compute the mean of values for points within each bin.
Empty bins will be represented by NaN.
* 'median' : compute the median of values for points within each
bin. Empty bins will be represented by NaN.
* 'count' : compute the count of points within each bin. This is
identical to an unweighted histogram. `values` array is not
referenced.
* 'sum' : compute the sum of values for points within each bin.
This is identical to a weighted histogram.
* function : a user-defined function which takes a 1D array of
values, and outputs a single numerical statistic. This function
will be called on the values in each bin. Empty bins will be
represented by function([]), or NaN if this returns an error.
bins : sequence or int, optional
The bin specification:
* A sequence of arrays describing the bin edges along each dimension.
* The number of bins for each dimension (nx, ny, ... =bins)
* The number of bins for all dimensions (nx=ny=...=bins).
range : sequence, optional
A sequence of lower and upper bin edges to be used if the edges are
not given explicitly in `bins`. Defaults to the minimum and maximum
values along each dimension.
Returns
-------
statistic : ndarray, shape(nx1, nx2, nx3,...)
The values of the selected statistic in each two-dimensional bin
edges : list of ndarrays
A list of D arrays describing the (nxi + 1) bin edges for each
dimension
binnumber : 1-D ndarray of ints
This assigns to each observation an integer that represents the bin
in which this observation falls. Array has the same length as values.
See Also
--------
np.histogramdd, binned_statistic, binned_statistic_2d
Notes
-----
.. versionadded:: 0.11.0
"""
if type(statistic) == str:
if statistic not in ['mean', 'median', 'count', 'sum', 'std']:
raise ValueError('unrecognized statistic "%s"' % statistic)
elif callable(statistic):
pass
else:
raise ValueError("statistic not understood")
# This code is based on np.histogramdd
try:
# Sample is an ND-array.
N, D = sample.shape
except (AttributeError, ValueError):
# Sample is a sequence of 1D arrays.
sample = np.atleast_2d(sample).T
N, D = sample.shape
nbin = np.empty(D, int)
edges = D * [None]
dedges = D * [None]
try:
M = len(bins)
if M != D:
raise AttributeError('The dimension of bins must be equal '
'to the dimension of the sample x.')
except TypeError:
bins = D * [bins]
# Select range for each dimension
# Used only if number of bins is given.
if range is None:
smin = np.atleast_1d(np.array(sample.min(0), float))
smax = np.atleast_1d(np.array(sample.max(0), float))
else:
smin = np.zeros(D)
smax = np.zeros(D)
for i in np.arange(D):
smin[i], smax[i] = range[i]
# Make sure the bins have a finite width.
for i in np.arange(len(smin)):
if smin[i] == smax[i]:
smin[i] = smin[i] - .5
smax[i] = smax[i] + .5
# Create edge arrays
for i in np.arange(D):
if np.isscalar(bins[i]):
nbin[i] = bins[i] + 2 # +2 for outlier bins
edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1)
else:
edges[i] = np.asarray(bins[i], float)
nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
dedges[i] = np.diff(edges[i])
nbin = np.asarray(nbin)
# Compute the bin number each sample falls into.
Ncount = {}
for i in np.arange(D):
Ncount[i] = np.digitize(sample[:, i], edges[i])
# Using digitize, values that fall on an edge are put in the right bin.
# For the rightmost bin, we want values equal to the right
# edge to be counted in the last bin, and not as an outlier.
for i in np.arange(D):
# Rounding precision
decimal = int(-np.log10(dedges[i].min())) + 6
# Find which points are on the rightmost edge.
on_edge = np.where(np.around(sample[:, i], decimal)
== np.around(edges[i][-1], decimal))[0]
# Shift these points one bin to the left.
Ncount[i][on_edge] -= 1
# Compute the sample indices in the flattened statistic matrix.
ni = nbin.argsort()
xy = np.zeros(N, int)
for i in np.arange(0, D - 1):
xy += Ncount[ni[i]] * nbin[ni[i + 1:]].prod()
xy += Ncount[ni[-1]]
result = np.empty(nbin.prod(), float)
if statistic == 'mean':
result.fill(np.nan)
flatcount = np.bincount(xy, None)
flatsum = np.bincount(xy, values)
a = flatcount.nonzero()
result[a] = flatsum[a] / flatcount[a]
elif statistic == 'std':
result.fill(0)
flatcount = np.bincount(xy, None)
flatsum = np.bincount(xy, values)
flatsum2 = np.bincount(xy, values ** 2)
a = flatcount.nonzero()
result[a] = np.sqrt(flatsum2[a] / flatcount[a]
- (flatsum[a] / flatcount[a]) ** 2)
elif statistic == 'count':
result.fill(0)
flatcount = np.bincount(xy, None)
a = np.arange(len(flatcount))
result[a] = flatcount
elif statistic == 'sum':
result.fill(0)
flatsum = np.bincount(xy, values)
a = np.arange(len(flatsum))
result[a] = flatsum
elif statistic == 'median':
result.fill(np.nan)
for i in np.unique(xy):
result[i] = np.median(values[xy == i])
elif callable(statistic):
with warnings.catch_warnings():
# Numpy generates warnings for mean/std/... with an empty list
warnings.filterwarnings('ignore', category=RuntimeWarning)
old = np.seterr(invalid='ignore')
try:
null = statistic([])
except:
null = np.nan
np.seterr(**old)
result.fill(null)
for i in np.unique(xy):
result[i] = statistic(values[xy == i])
# Shape into a proper matrix
result = result.reshape(np.sort(nbin))
for i in np.arange(nbin.size):
j = ni.argsort()[i]
result = result.swapaxes(i, j)
ni[i], ni[j] = ni[j], ni[i]
# Remove outliers (indices 0 and -1 for each dimension).
core = D * [slice(1, -1)]
result = result[core]
if (result.shape != nbin - 2).any():
raise RuntimeError('Internal Shape Error')
return result, edges, xy
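The per-bin statistics above avoid Python loops by using np.bincount: with `values` as weights it gives per-bin sums, and dividing by the unweighted counts gives per-bin means. A standalone sketch of that trick:

import numpy as np

binnumber = np.array([0, 0, 1, 2, 2, 2])          # flattened bin index per sample
values = np.array([1.0, 3.0, 5.0, 2.0, 4.0, 6.0])

counts = np.bincount(binnumber)                   # samples per bin
sums = np.bincount(binnumber, weights=values)     # per-bin sums
means = np.full(len(counts), np.nan)              # empty bins stay NaN
nonzero = counts.nonzero()
means[nonzero] = sums[nonzero] / counts[nonzero]  # -> [2., 5., 4.]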

@ -13,6 +13,10 @@ _EPS = np.finfo(float).eps
# The largest [in magnitude] usable floating value.
_XMAX = np.finfo(float).machar.xmax
# The log of the largest usable floating value; useful for knowing
# when exp(something) will overflow
_LOGXMAX = np.log(_XMAX)
# The smallest [in magnitude] usable floating value.
_XMIN = np.finfo(float).machar.xmin
@ -21,4 +25,3 @@ _EULER = 0.577215664901532860606512090082402431042
# special.zeta(3, 1) Apery's constant
_ZETA3 = 1.202056903159594285399738161511449990765
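_LOGXMAX is a convenient guard before exponentiating: any argument above it overflows to inf in double precision. A minimal sketch of that use (recomputing the constant locally so the snippet stands alone):

import numpy as np

_LOGXMAX = np.log(np.finfo(float).max)      # about 709.78 for IEEE doubles

x = np.array([1.0, 500.0, 800.0])
out = np.full(x.shape, np.inf)              # arguments past the cutoff overflow
ok = x < _LOGXMAX
out[ok] = np.exp(x[ok])                     # -> [2.718..., 1.40e+217, inf]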

File diff suppressed because it is too large.

@ -5,15 +5,15 @@
from __future__ import division, print_function, absolute_import
from scipy import special
from scipy.special import gammaln as gamln
from scipy.special import entr, gammaln as gamln
from scipy.misc import logsumexp
from numpy import floor, ceil, log, exp, sqrt, log1p, expm1, tanh, cosh, sinh
import numpy as np
import numpy.random as mtrand
from ._distn_infrastructure import (rv_discrete, _lazywhere, _ncx2_pdf,
_ncx2_cdf, get_distribution_names)
from ._distn_infrastructure import (
rv_discrete, _lazywhere, _ncx2_pdf, _ncx2_cdf, get_distribution_names)
class binom_gen(rv_discrete):
@ -31,11 +31,13 @@ class binom_gen(rv_discrete):
`binom` takes ``n`` and ``p`` as shape parameters.
%(after_notes)s
%(example)s
"""
def _rvs(self, n, p):
return mtrand.binomial(n, p, self._size)
return self._random_state.binomial(n, p, self._size)
def _argcheck(self, n, p):
self.b = n
@ -78,8 +80,7 @@ class binom_gen(rv_discrete):
def _entropy(self, n, p):
k = np.r_[0:n + 1]
vals = self._pmf(k, n, p)
h = -np.sum(special.xlogy(vals, vals), axis=0)
return h
return np.sum(entr(vals), axis=0)
binom = binom_gen(name='binom')
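The switch from the module-level mtrand to self._random_state ties sampling to the per-distribution random state added in scipy 0.16, so draws become reproducible without touching numpy's global seed. A sketch of the user-facing effect (parameter values are arbitrary):

from scipy.stats import binom

a = binom.rvs(10, 0.5, size=5, random_state=1234)
b = binom.rvs(10, 0.5, size=5, random_state=1234)
assert (a == b).all()                       # same state, same draws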
@ -99,6 +100,8 @@ class bernoulli_gen(binom_gen):
`bernoulli` takes ``p`` as shape parameter.
%(after_notes)s
%(example)s
"""
@ -127,8 +130,7 @@ class bernoulli_gen(binom_gen):
return binom._stats(1, p)
def _entropy(self, p):
h = -special.xlogy(p, p) - special.xlogy(1 - p, 1 - p)
return h
return entr(p) + entr(1-p)
bernoulli = bernoulli_gen(b=1, name='bernoulli')
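The entropy rewrites lean on scipy.special.entr, which computes -x*log(x) elementwise (and 0 at x = 0). A quick check that it reproduces the old xlogy formulation for the Bernoulli case:

import numpy as np
from scipy.special import entr, xlogy

p = 0.3
old = -xlogy(p, p) - xlogy(1 - p, 1 - p)    # previous formulation
new = entr(p) + entr(1 - p)                 # entr(x) == -x*log(x)
assert np.isclose(old, new)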
@ -147,21 +149,23 @@ class nbinom_gen(rv_discrete):
`nbinom` takes ``n`` and ``p`` as shape parameters.
%(after_notes)s
%(example)s
"""
def _rvs(self, n, p):
return mtrand.negative_binomial(n, p, self._size)
return self._random_state.negative_binomial(n, p, self._size)
def _argcheck(self, n, p):
return (n >= 0) & (p >= 0) & (p <= 1)
return (n > 0) & (p >= 0) & (p <= 1)
def _pmf(self, x, n, p):
return exp(self._logpmf(x, n, p))
def _logpmf(self, x, n, p):
coeff = gamln(n + x) - gamln(x + 1) - gamln(n)
return coeff + special.xlogy(n, p) + special.xlog1py(x, -p)
coeff = gamln(n+x) - gamln(x+1) - gamln(n)
return coeff + n*log(p) + special.xlog1py(x, -p)
def _cdf(self, x, n, p):
k = floor(x)
@ -204,11 +208,13 @@ class geom_gen(rv_discrete):
`geom` takes ``p`` as shape parameter.
%(after_notes)s
%(example)s
"""
def _rvs(self, p):
return mtrand.geometric(p, size=self._size)
return self._random_state.geometric(p, size=self._size)
def _argcheck(self, p):
return (p <= 1) & (p >= 0)
@ -221,14 +227,14 @@ class geom_gen(rv_discrete):
def _cdf(self, x, p):
k = floor(x)
return -expm1(log1p(-p) * k)
return -expm1(log1p(-p)*k)
def _sf(self, x, p):
return np.exp(self._logsf(x, p))
def _logsf(self, x, p):
k = floor(x)
return k * log1p(-p)
return k*log1p(-p)
def _ppf(self, q, p):
vals = ceil(log1p(-q) / log1p(-p))
@ -262,6 +268,8 @@ class hypergeom_gen(rv_discrete):
pmf(k, M, n, N) = choose(n, k) * choose(M - n, N - k) / choose(M, N),
for max(0, N - (M-n)) <= k <= min(n, N)
%(after_notes)s
Examples
--------
>>> from scipy.stats import hypergeom
@ -297,10 +305,10 @@ class hypergeom_gen(rv_discrete):
"""
def _rvs(self, M, n, N):
return mtrand.hypergeometric(n, M-n, N, size=self._size)
return self._random_state.hypergeometric(n, M-n, N, size=self._size)
def _argcheck(self, M, n, N):
cond = rv_discrete._argcheck(self, M, n, N)
cond = (M > 0) & (n >= 0) & (N >= 0)
cond &= (n <= M) & (N <= M)
self.a = max(N-(M-n), 0)
self.b = min(n, N)
@ -338,8 +346,7 @@ class hypergeom_gen(rv_discrete):
def _entropy(self, M, n, N):
k = np.r_[N - (M - n):min(n, N) + 1]
vals = self.pmf(k, M, n, N)
h = -np.sum(special.xlogy(vals, vals), axis=0)
return h
return np.sum(entr(vals), axis=0)
def _sf(self, k, M, n, N):
"""More precise calculation, 1 - cdf doesn't cut it."""
@ -354,6 +361,17 @@ class hypergeom_gen(rv_discrete):
k2 = np.arange(quant + 1, draw + 1)
res.append(np.sum(self._pmf(k2, tot, good, draw)))
return np.asarray(res)
def _logsf(self, k, M, n, N):
"""
More precise calculation than log(sf)
"""
res = []
for quant, tot, good, draw in zip(k, M, n, N):
# Integration over probability mass function using logsumexp
k2 = np.arange(quant + 1, draw + 1)
res.append(logsumexp(self._logpmf(k2, tot, good, draw)))
return np.asarray(res)
hypergeom = hypergeom_gen(name='hypergeom')
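The new _logsf sums the tail in log space with logsumexp, which keeps precision where exponentiating tiny log-probabilities first would underflow. A standalone illustration (the code above imports logsumexp from scipy.misc, its scipy 0.16 location; newer scipy exposes it from scipy.special):

import numpy as np
from scipy.special import logsumexp

logp = np.array([-800.0, -801.0, -802.0])   # log of tiny tail probabilities
naive = np.log(np.sum(np.exp(logp)))        # exp underflows to 0 -> -inf (with a warning)
stable = logsumexp(logp)                    # about -799.59, computed in log space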
@ -373,13 +391,15 @@ class logser_gen(rv_discrete):
`logser` takes ``p`` as shape parameter.
%(after_notes)s
%(example)s
"""
def _rvs(self, p):
# looks wrong for p>0.5, too few k=1
# trying to use generic is worse, no k=1 at all
return mtrand.logseries(p, size=self._size)
return self._random_state.logseries(p, size=self._size)
def _argcheck(self, p):
return (p > 0) & (p < 1)
@ -419,14 +439,21 @@ class poisson_gen(rv_discrete):
`poisson` takes ``mu`` as shape parameter.
%(after_notes)s
%(example)s
"""
# Override rv_discrete._argcheck to allow mu=0.
def _argcheck(self, mu):
return mu >= 0
def _rvs(self, mu):
return mtrand.poisson(mu, self._size)
return self._random_state.poisson(mu, self._size)
def _logpmf(self, k, mu):
Pk = k*log(mu)-gamln(k+1) - mu
Pk = special.xlogy(k, mu) - gamln(k + 1) - mu
return Pk
def _pmf(self, k, mu):
@ -449,9 +476,11 @@ class poisson_gen(rv_discrete):
def _stats(self, mu):
var = mu
tmp = np.asarray(mu)
g1 = sqrt(1.0 / tmp)
g2 = 1.0 / tmp
mu_nonzero = tmp > 0
g1 = _lazywhere(mu_nonzero, (tmp,), lambda x: sqrt(1.0/x), np.inf)
g2 = _lazywhere(mu_nonzero, (tmp,), lambda x: 1.0/x, np.inf)
return mu, var, g1, g2
poisson = poisson_gen(name="poisson", longname='A Poisson')
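The _lazywhere change evaluates 1/mu and sqrt(1/mu) only where mu > 0 and fills mu == 0 with inf, instead of emitting divide-by-zero warnings. The same effect with plain numpy masking, for illustration:

import numpy as np

mu = np.array([0.0, 2.0, 5.0])
g1 = np.full_like(mu, np.inf)               # skewness -> inf as mu -> 0
g2 = np.full_like(mu, np.inf)               # excess kurtosis -> inf as mu -> 0
nz = mu > 0
g1[nz] = np.sqrt(1.0 / mu[nz])
g2[nz] = 1.0 / mu[nz]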
@ -470,6 +499,8 @@ class planck_gen(rv_discrete):
`planck` takes ``lambda_`` as shape parameter.
%(after_notes)s
%(example)s
"""
@ -487,7 +518,7 @@ class planck_gen(rv_discrete):
def _pmf(self, k, lambda_):
fact = -expm1(-lambda_)
return fact * exp(-lambda_ * k)
return fact*exp(-lambda_*k)
def _cdf(self, x, lambda_):
k = floor(x)
@ -528,12 +559,14 @@ class boltzmann_gen(rv_discrete):
`boltzmann` takes ``lambda_`` and ``N`` as shape parameters.
%(after_notes)s
%(example)s
"""
def _pmf(self, k, lambda_, N):
fact = (expm1(-lambda_)) / (expm1(-lambda_ * N))
return fact * exp(-lambda_ * k)
return fact*exp(-lambda_*k)
def _cdf(self, x, lambda_, N):
k = floor(x)
@ -559,7 +592,7 @@ class boltzmann_gen(rv_discrete):
g2 = g2 / trm2 / trm2
return mu, var, g1, g2
boltzmann = boltzmann_gen(name='boltzmann',
longname='A truncated discrete exponential ')
longname='A truncated discrete exponential ')
class randint_gen(rv_discrete):
@ -577,8 +610,7 @@ class randint_gen(rv_discrete):
`randint` takes ``low`` and ``high`` as shape parameters.
Note the difference to the numpy ``random_integers`` which
returns integers on a *closed* interval ``[low, high]``.
%(after_notes)s
%(example)s
@ -616,7 +648,7 @@ class randint_gen(rv_discrete):
If ``high`` is ``None``, then range is >=0 and < low
"""
return mtrand.randint(low, high, self._size)
return self._random_state.randint(low, high, self._size)
def _entropy(self, low, high):
return log(high - low)
@ -624,21 +656,6 @@ randint = randint_gen(name='randint', longname='A discrete uniform '
'(random integer)')
def harmonic(n, r):
return (1./n + special.polygamma(r-1, n)/special.gamma(r) +
special.zeta(r, 1))
def H(n):
"""Returns the n-th harmonic number.
http://en.wikipedia.org/wiki/Harmonic_number
"""
# Euler-Mascheroni constant
gamma = 0.57721566490153286060651209008240243104215933593992
return gamma + special.digamma(n+1)
# FIXME: problems sampling.
class zipf_gen(rv_discrete):
"""A Zipf discrete random variable.
@ -655,11 +672,13 @@ class zipf_gen(rv_discrete):
`zipf` takes ``a`` as shape parameter.
%(after_notes)s
%(example)s
"""
def _rvs(self, a):
return mtrand.zipf(a, size=self._size)
return self._random_state.zipf(a, size=self._size)
def _argcheck(self, a):
return a > 1
@ -691,6 +710,8 @@ class dlaplace_gen(rv_discrete):
`dlaplace` takes ``a`` as shape parameter.
%(after_notes)s
%(example)s
"""
@ -705,9 +726,8 @@ class dlaplace_gen(rv_discrete):
def _ppf(self, q, a):
const = 1 + exp(a)
vals = ceil(np.where(q < 1.0 / (1 + exp(-a)),
log(q*const) / a - 1,
-log((1-q) * const) / a))
vals = ceil(np.where(q < 1.0 / (1 + exp(-a)), log(q*const) / a - 1,
-log((1-q) * const) / a))
vals1 = vals - 1
return np.where(self._cdf(vals1, a) >= q, vals1, vals)
@ -746,25 +766,28 @@ class skellam_gen(rv_discrete):
`skellam` takes ``mu1`` and ``mu2`` as shape parameters.
%(after_notes)s
%(example)s
"""
def _rvs(self, mu1, mu2):
n = self._size
return mtrand.poisson(mu1, n) - mtrand.poisson(mu2, n)
return (self._random_state.poisson(mu1, n) -
self._random_state.poisson(mu2, n))
def _pmf(self, x, mu1, mu2):
px = np.where(x < 0,
_ncx2_pdf(2*mu2, 2*(1-x), 2*mu1)*2,
_ncx2_pdf(2*mu1, 2*(1+x), 2*mu2)*2)
_ncx2_pdf(2*mu2, 2*(1-x), 2*mu1)*2,
_ncx2_pdf(2*mu1, 2*(1+x), 2*mu2)*2)
# ncx2.pdf() returns nan's for extremely low probabilities
return px
def _cdf(self, x, mu1, mu2):
x = floor(x)
px = np.where(x < 0,
_ncx2_cdf(2*mu2, -2*x, 2*mu1),
1-_ncx2_cdf(2*mu1, 2*(x+1), 2*mu2))
_ncx2_cdf(2*mu2, -2*x, 2*mu1),
1-_ncx2_cdf(2*mu1, 2*(x+1), 2*mu2))
return px
def _stats(self, mu1, mu2):

File diff suppressed because it is too large.

@ -10,6 +10,7 @@ distcont = [
['betaprime', (5, 6)],
['bradford', (0.29891359763170633,)],
['burr', (10.5, 4.3)],
['burr12', (10, 4)],
['cauchy', ()],
['chi', (78,)],
['chi2', (55,)],
@ -18,6 +19,7 @@ distcont = [
['dweibull', (2.0685080649914673,)],
['erlang', (10,)],
['expon', ()],
['exponnorm', (1.5,)],
['exponpow', (2.697119160358469,)],
['exponweib', (2.8923945291034436, 1.9505288745913174)],
['f', (29, 18)],
@ -33,8 +35,11 @@ distcont = [
['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)],
['genextreme', (-0.1,)],
['gengamma', (4.4162385429431925, 3.1193091679242761)],
['gengamma', (4.4162385429431925, -3.1193091679242761)],
['genhalflogistic', (0.77274727809929322,)],
['genlogistic', (0.41192440799679475,)],
['gennorm', (1.2988442399460265,)],
['halfgennorm', (0.6748054997000371,)],
['genpareto', (0.1,)], # use case with finite moments
['gilbrat', ()],
['gompertz', (0.94743713075105251,)],
@ -80,6 +85,7 @@ distcont = [
['reciprocal', (0.0062309367010521255, 1.0062309367010522)],
['rice', (0.7749725210111873,)],
['semicircular', ()],
['skewnorm', (4.0,)],
['t', (2.7433514990818093,)],
['triang', (0.15785029824528218,)],
['truncexpon', (4.6907725456810478,)],
@ -113,4 +119,3 @@ distdiscrete = [
['skellam', (15, 8)],
['zipf', (6.5,)]
]
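These (name, shape-parameter) pairs are test fixtures; a minimal sketch of how such a table is typically consumed, instantiating each distribution from scipy.stats and sanity-checking a cdf/ppf round trip (the subset below is illustrative):

import numpy as np
import scipy.stats

sample_params = [
    ['burr12', (10, 4)],
    ['exponnorm', (1.5,)],
    ['skewnorm', (4.0,)],
]

for name, shapes in sample_params:
    dist = getattr(scipy.stats, name)
    q = np.array([0.1, 0.5, 0.9])
    x = dist.ppf(q, *shapes)
    assert np.allclose(dist.cdf(x, *shapes), q)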

@ -1,884 +0,0 @@
#
# Author: Joris Vankerschaver 2013
#
from __future__ import division, print_function, absolute_import
import numpy as np
import scipy.linalg
from scipy.misc import doccer
from scipy.special import gammaln
__all__ = ['multivariate_normal', 'dirichlet']
_LOG_2PI = np.log(2 * np.pi)
def _process_parameters(dim, mean, cov):
"""
Infer dimensionality from mean or covariance matrix, ensure that
mean and covariance are full vector resp. matrix.
"""
# Try to infer dimensionality
if dim is None:
if mean is None:
if cov is None:
dim = 1
else:
cov = np.asarray(cov, dtype=float)
if cov.ndim < 2:
dim = 1
else:
dim = cov.shape[0]
else:
mean = np.asarray(mean, dtype=float)
dim = mean.size
else:
if not np.isscalar(dim):
raise ValueError("Dimension of random variable must be a scalar.")
# Check input sizes and return full arrays for mean and cov if necessary
if mean is None:
mean = np.zeros(dim)
mean = np.asarray(mean, dtype=float)
if cov is None:
cov = 1.0
cov = np.asarray(cov, dtype=float)
if dim == 1:
mean.shape = (1,)
cov.shape = (1, 1)
if mean.ndim != 1 or mean.shape[0] != dim:
raise ValueError("Array 'mean' must be a vector of length %d." % dim)
if cov.ndim == 0:
cov = cov * np.eye(dim)
elif cov.ndim == 1:
cov = np.diag(cov)
elif cov.ndim == 2 and cov.shape != (dim, dim):
rows, cols = cov.shape
if rows != cols:
msg = ("Array 'cov' must be square if it is two dimensional,"
" but cov.shape = %s." % str(cov.shape))
else:
msg = ("Dimension mismatch: array 'cov' is of shape %s,"
" but 'mean' is a vector of length %d.")
msg = msg % (str(cov.shape), len(mean))
raise ValueError(msg)
elif cov.ndim > 2:
raise ValueError("Array 'cov' must be at most two-dimensional,"
" but cov.ndim = %d" % cov.ndim)
return dim, mean, cov
def _process_quantiles(x, dim):
"""
Adjust quantiles array so that last axis labels the components of
each data point.
"""
x = np.asarray(x, dtype=float)
if x.ndim == 0:
x = x[np.newaxis]
elif x.ndim == 1:
if dim == 1:
x = x[:, np.newaxis]
else:
x = x[np.newaxis, :]
return x
def _squeeze_output(out):
"""
Remove single-dimensional entries from array and convert to scalar,
if necessary.
"""
out = out.squeeze()
if out.ndim == 0:
out = out[()]
return out
def _eigvalsh_to_eps(spectrum, cond=None, rcond=None):
"""
Determine which eigenvalues are "small" given the spectrum.
This is for compatibility across various linear algebra functions
that should agree about whether or not a Hermitian matrix is numerically
singular and what is its numerical matrix rank.
This is designed to be compatible with scipy.linalg.pinvh.
Parameters
----------
spectrum : 1d ndarray
Array of eigenvalues of a Hermitian matrix.
cond, rcond : float, optional
Cutoff for small eigenvalues.
Singular values smaller than rcond * largest_eigenvalue are
considered zero.
If None or -1, suitable machine precision is used.
Returns
-------
eps : float
Magnitude cutoff for numerical negligibility.
"""
if rcond is not None:
cond = rcond
if cond in [None, -1]:
t = spectrum.dtype.char.lower()
factor = {'f': 1E3, 'd': 1E6}
cond = factor[t] * np.finfo(t).eps
eps = cond * np.max(abs(spectrum))
return eps
def _pinv_1d(v, eps=1e-5):
"""
A helper function for computing the pseudoinverse.
Parameters
----------
v : iterable of numbers
This may be thought of as a vector of eigenvalues or singular values.
eps : float
Values with magnitude no greater than eps are considered negligible.
Returns
-------
v_pinv : 1d float ndarray
A vector of pseudo-inverted numbers.
"""
return np.array([0 if abs(x) <= eps else 1/x for x in v], dtype=float)
class _PSD(object):
"""
Compute coordinated functions of a symmetric positive semidefinite matrix.
This class addresses two issues. Firstly it allows the pseudoinverse,
the logarithm of the pseudo-determinant, and the rank of the matrix
to be computed using one call to eigh instead of three.
Secondly it allows these functions to be computed in a way
that gives mutually compatible results.
All of the functions are computed with a common understanding as to
which of the eigenvalues are to be considered negligibly small.
The functions are designed to coordinate with scipy.linalg.pinvh()
but not necessarily with np.linalg.det() or with np.linalg.matrix_rank().
Parameters
----------
M : 2d array-like
Symmetric positive semidefinite matrix.
cond, rcond : float, optional
Cutoff for small eigenvalues.
Singular values smaller than rcond * largest_eigenvalue are
considered zero.
If None or -1, suitable machine precision is used.
lower : bool, optional
Whether the pertinent array data is taken from the lower
or upper triangle of M. (Default: lower)
check_finite : bool, optional
Whether to check that the input matrices contain only finite
numbers. Disabling may give a performance gain, but may result
in problems (crashes, non-termination) if the inputs do contain
infinities or NaNs.
allow_singular : bool, optional
Whether to allow a singular matrix. (Default: True)
Notes
-----
The arguments are similar to those of scipy.linalg.pinvh().
"""
def __init__(self, M, cond=None, rcond=None, lower=True,
check_finite=True, allow_singular=True):
# Compute the symmetric eigendecomposition.
# Note that eigh takes care of array conversion, chkfinite,
# and assertion that the matrix is square.
s, u = scipy.linalg.eigh(M, lower=lower, check_finite=check_finite)
eps = _eigvalsh_to_eps(s, cond, rcond)
if np.min(s) < -eps:
raise ValueError('the input matrix must be positive semidefinite')
d = s[s > eps]
if len(d) < len(s) and not allow_singular:
raise np.linalg.LinAlgError('singular matrix')
s_pinv = _pinv_1d(s, eps)
U = np.multiply(u, np.sqrt(s_pinv))
# Initialize the eagerly precomputed attributes.
self.rank = len(d)
self.U = U
self.log_pdet = np.sum(np.log(d))
# Initialize an attribute to be lazily computed.
self._pinv = None
@property
def pinv(self):
if self._pinv is None:
self._pinv = np.dot(self.U, self.U.T)
return self._pinv
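A standalone sketch of what _PSD extracts from a single eigendecomposition: the pseudo-inverse (matching scipy.linalg.pinvh), the log pseudo-determinant, and the rank, shown here for a deliberately singular matrix:

import numpy as np
import scipy.linalg

M = np.array([[1.0, 1.0],
              [1.0, 1.0]])                  # rank-1 positive semidefinite
s, u = scipy.linalg.eigh(M)
eps = 1e6 * np.finfo(float).eps * np.abs(s).max()

d = s[s > eps]                              # the non-negligible eigenvalues
s_pinv = np.array([0.0 if abs(x) <= eps else 1.0 / x for x in s])
U = u * np.sqrt(s_pinv)

pinv = U.dot(U.T)                           # equals scipy.linalg.pinvh(M)
log_pdet = np.sum(np.log(d))                # log pseudo-determinant: log(2)
rank = len(d)                               # 1
assert np.allclose(pinv, scipy.linalg.pinvh(M))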
_doc_default_callparams = """\
mean : array_like, optional
Mean of the distribution (default zero)
cov : array_like, optional
Covariance matrix of the distribution (default one)
allow_singular : bool, optional
Whether to allow a singular covariance matrix. (Default: False)
"""
_doc_callparams_note = \
"""Setting the parameter `mean` to `None` is equivalent to having `mean`
be the zero-vector. The parameter `cov` can be a scalar, in which case
the covariance matrix is the identity times that value, a vector of
diagonal entries for the covariance matrix, or a two-dimensional
array_like.
"""
_doc_frozen_callparams = ""
_doc_frozen_callparams_note = \
"""See class definition for a detailed description of parameters."""
docdict_params = {
'_doc_default_callparams': _doc_default_callparams,
'_doc_callparams_note': _doc_callparams_note
}
docdict_noparams = {
'_doc_default_callparams': _doc_frozen_callparams,
'_doc_callparams_note': _doc_frozen_callparams_note
}
class multivariate_normal_gen(object):
r"""
A multivariate normal random variable.
The `mean` keyword specifies the mean. The `cov` keyword specifies the
covariance matrix.
Methods
-------
pdf(x, mean=None, cov=1, allow_singular=False)
Probability density function.
logpdf(x, mean=None, cov=1, allow_singular=False)
Log of the probability density function.
rvs(mean=None, cov=1, allow_singular=False, size=1)
Draw random samples from a multivariate normal distribution.
entropy()
Compute the differential entropy of the multivariate normal.
Parameters
----------
x : array_like
Quantiles, with the last axis of `x` denoting the components.
%(_doc_default_callparams)s
Alternatively, the object may be called (as a function) to fix the mean
and covariance parameters, returning a "frozen" multivariate normal
random variable:
rv = multivariate_normal(mean=None, cov=1, allow_singular=False)
- Frozen object with the same methods but holding the given
mean and covariance fixed.
Notes
-----
%(_doc_callparams_note)s
The covariance matrix `cov` must be a (symmetric) positive
semi-definite matrix. The determinant and inverse of `cov` are computed
as the pseudo-determinant and pseudo-inverse, respectively, so
that `cov` does not need to have full rank.
The probability density function for `multivariate_normal` is
.. math::
f(x) = \frac{1}{\sqrt{(2 \pi)^k \det \Sigma}} \exp\left( -\frac{1}{2} (x - \mu)^T \Sigma^{-1} (x - \mu) \right),
where :math:`\mu` is the mean, :math:`\Sigma` the covariance matrix,
and :math:`k` is the dimension of the space where :math:`x` takes values.
.. versionadded:: 0.14.0
Examples
--------
>>> import matplotlib.pyplot as plt
>>> from scipy.stats import multivariate_normal
>>> x = np.linspace(0, 5, 10, endpoint=False)
>>> y = multivariate_normal.pdf(x, mean=2.5, cov=0.5); y
array([ 0.00108914, 0.01033349, 0.05946514, 0.20755375, 0.43939129,
0.56418958, 0.43939129, 0.20755375, 0.05946514, 0.01033349])
>>> plt.plot(x, y)
The input quantiles can be any shape of array, as long as the last
axis labels the components. This allows us for instance to
display the frozen pdf for a non-isotropic random variable in 2D as
follows:
>>> x, y = np.mgrid[-1:1:.01, -1:1:.01]
>>> pos = np.empty(x.shape + (2,))
>>> pos[:, :, 0] = x; pos[:, :, 1] = y
>>> rv = multivariate_normal([0.5, -0.2], [[2.0, 0.3], [0.3, 0.5]])
>>> plt.contourf(x, y, rv.pdf(pos))
"""
def __init__(self):
self.__doc__ = doccer.docformat(self.__doc__, docdict_params)
def __call__(self, mean=None, cov=1, allow_singular=False):
"""
Create a frozen multivariate normal distribution.
See `multivariate_normal_frozen` for more information.
"""
return multivariate_normal_frozen(mean, cov,
allow_singular=allow_singular)
def _logpdf(self, x, mean, prec_U, log_det_cov, rank):
"""
Parameters
----------
x : ndarray
Points at which to evaluate the log of the probability
density function
mean : ndarray
Mean of the distribution
prec_U : ndarray
A decomposition such that np.dot(prec_U, prec_U.T)
is the precision matrix, i.e. inverse of the covariance matrix.
log_det_cov : float
Logarithm of the determinant of the covariance matrix
rank : int
Rank of the covariance matrix.
Notes
-----
As this function does no argument checking, it should not be
called directly; use 'logpdf' instead.
"""
dev = x - mean
maha = np.sum(np.square(np.dot(dev, prec_U)), axis=-1)
return -0.5 * (rank * _LOG_2PI + log_det_cov + maha)
def logpdf(self, x, mean, cov, allow_singular=False):
"""
Log of the multivariate normal probability density function.
Parameters
----------
x : array_like
Quantiles, with the last axis of `x` denoting the components.
%(_doc_default_callparams)s
Notes
-----
%(_doc_callparams_note)s
Returns
-------
pdf : ndarray
Log of the probability density function evaluated at `x`
"""
dim, mean, cov = _process_parameters(None, mean, cov)
x = _process_quantiles(x, dim)
psd = _PSD(cov, allow_singular=allow_singular)
out = self._logpdf(x, mean, psd.U, psd.log_pdet, psd.rank)
return _squeeze_output(out)
def pdf(self, x, mean, cov, allow_singular=False):
"""
Multivariate normal probability density function.
Parameters
----------
x : array_like
Quantiles, with the last axis of `x` denoting the components.
%(_doc_default_callparams)s
Notes
-----
%(_doc_callparams_note)s
Returns
-------
pdf : ndarray
Probability density function evaluated at `x`
"""
dim, mean, cov = _process_parameters(None, mean, cov)
x = _process_quantiles(x, dim)
psd = _PSD(cov, allow_singular=allow_singular)
out = np.exp(self._logpdf(x, mean, psd.U, psd.log_pdet, psd.rank))
return _squeeze_output(out)
def rvs(self, mean=None, cov=1, size=1):
"""
Draw random samples from a multivariate normal distribution.
Parameters
----------
%(_doc_default_callparams)s
size : integer, optional
Number of samples to draw (default 1).
Notes
-----
%(_doc_callparams_note)s
Returns
-------
rvs : ndarray or scalar
Random variates of size (`size`, `N`), where `N` is the
dimension of the random variable.
"""
dim, mean, cov = _process_parameters(None, mean, cov)
out = np.random.multivariate_normal(mean, cov, size)
return _squeeze_output(out)
def entropy(self, mean=None, cov=1):
"""
Compute the differential entropy of the multivariate normal.
Parameters
----------
%(_doc_default_callparams)s
Notes
-----
%(_doc_callparams_note)s
Returns
-------
h : scalar
Entropy of the multivariate normal distribution
"""
dim, mean, cov = _process_parameters(None, mean, cov)
return 0.5 * np.log(np.linalg.det(2 * np.pi * np.e * cov))
multivariate_normal = multivariate_normal_gen()
class multivariate_normal_frozen(object):
def __init__(self, mean=None, cov=1, allow_singular=False):
"""
Create a frozen multivariate normal distribution.
Parameters
----------
mean : array_like, optional
Mean of the distribution (default zero)
cov : array_like, optional
Covariance matrix of the distribution (default one)
allow_singular : bool, optional
If this flag is True then tolerate a singular
covariance matrix (default False).
Examples
--------
When called with the default parameters, this will create a 1D random
variable with mean 0 and covariance 1:
>>> from scipy.stats import multivariate_normal
>>> r = multivariate_normal()
>>> r.mean
array([ 0.])
>>> r.cov
array([[1.]])
"""
self.dim, self.mean, self.cov = _process_parameters(None, mean, cov)
self.cov_info = _PSD(self.cov, allow_singular=allow_singular)
self._mnorm = multivariate_normal_gen()
def logpdf(self, x):
x = _process_quantiles(x, self.dim)
out = self._mnorm._logpdf(x, self.mean, self.cov_info.U,
self.cov_info.log_pdet, self.cov_info.rank)
return _squeeze_output(out)
def pdf(self, x):
return np.exp(self.logpdf(x))
def rvs(self, size=1):
return self._mnorm.rvs(self.mean, self.cov, size)
def entropy(self):
"""
Computes the differential entropy of the multivariate normal.
Returns
-------
h : scalar
Entropy of the multivariate normal distribution
"""
log_pdet = self.cov_info.log_pdet
rank = self.cov_info.rank
return 0.5 * (rank * (_LOG_2PI + 1) + log_pdet)
# Set frozen generator docstrings from corresponding docstrings in
# multivariate_normal_gen and fill in default strings in class docstrings
for name in ['logpdf', 'pdf', 'rvs']:
method = multivariate_normal_gen.__dict__[name]
method_frozen = multivariate_normal_frozen.__dict__[name]
method_frozen.__doc__ = doccer.docformat(method.__doc__, docdict_noparams)
method.__doc__ = doccer.docformat(method.__doc__, docdict_params)
_dirichlet_doc_default_callparams = """\
alpha : array_like
The concentration parameters. The number of entries determines the
dimensionality of the distribution.
"""
_dirichlet_doc_frozen_callparams = ""
_dirichlet_doc_frozen_callparams_note = \
"""See class definition for a detailed description of parameters."""
dirichlet_docdict_params = {
'_dirichlet_doc_default_callparams': _dirichlet_doc_default_callparams,
}
dirichlet_docdict_noparams = {
'_dirichlet_doc_default_callparams': _dirichlet_doc_frozen_callparams,
}
def _dirichlet_check_parameters(alpha):
alpha = np.asarray(alpha)
if np.min(alpha) <= 0:
raise ValueError("All parameters must be greater than 0")
elif alpha.ndim != 1:
raise ValueError("Parameter vector 'a' must be one dimensional, " +
"but a.shape = %s." % str(alpha.shape))
return alpha
def _dirichlet_check_input(alpha, x):
x = np.asarray(x)
if x.shape[0] + 1 != alpha.shape[0] and x.shape[0] != alpha.shape[0]:
raise ValueError("Vector 'x' must have one entry less then the" +
" parameter vector 'a', but alpha.shape = " +
"%s and " % alpha.shape +
"x.shape = %s." % x.shape)
if x.shape[0] != alpha.shape[0]:
xk = np.array([1 - np.sum(x, 0)])
if xk.ndim == 1:
x = np.append(x, xk)
elif xk.ndim == 2:
x = np.vstack((x, xk))
else:
raise ValueError("The input must be one dimensional or a two "
"dimensional matrix containing the entries.")
if np.min(x) < 0:
raise ValueError("Each entry in 'x' must be greater or equal zero.")
if np.max(x) > 1:
raise ValueError("Each entry in 'x' must be smaller or equal one.")
if (np.abs(np.sum(x, 0) - 1.0) > 10e-10).any():
raise ValueError("The input vector 'x' must lie within the normal " +
"simplex. but sum(x)=%f." % np.sum(x, 0))
return x
def _lnB(alpha):
r"""
Internal helper function to compute the log of the useful quotient
.. math::
B(\alpha) = \frac{\prod_{i=1}^{K}\Gamma(\alpha_i)}{\Gamma\left(\sum_{i=1}^{K}\alpha_i\right)}
Parameters
----------
%(_dirichlet_doc_default_callparams)s
Returns
-------
B : scalar
Helper quotient, internal use only
"""
return np.sum(gammaln(alpha)) - gammaln(np.sum(alpha))
class dirichlet_gen(object):
r"""
A Dirichlet random variable.
The `alpha` keyword specifies the concentration parameters of the
distribution.
.. versionadded:: 0.15.0
Methods
-------
pdf(x, alpha)
Probability density function.
logpdf(x, alpha)
Log of the probability density function.
rvs(alpha, size=1)
Draw random samples from a Dirichlet distribution.
mean(alpha)
The mean of the Dirichlet distribution
var(alpha)
The variance of the Dirichlet distribution
entropy(alpha)
Compute the differential entropy of the Dirichlet distribution.
Parameters
----------
x : array_like
Quantiles, with the last axis of `x` denoting the components.
%(_dirichlet_doc_default_callparams)s
Alternatively, the object may be called (as a function) to fix
concentration parameters, returning a "frozen" Dirichlet
random variable:
rv = dirichlet(alpha)
- Frozen object with the same methods but holding the given
concentration parameters fixed.
Notes
-----
Each :math:`\alpha` entry must be positive. The distribution has support
only on the simplex defined by
.. math::
\sum_{i=1}^{K} x_i \le 1
The probability density function for `dirichlet` is
.. math::
f(x) = \frac{1}{\mathrm{B}(\boldsymbol\alpha)} \prod_{i=1}^K x_i^{\alpha_i - 1}
where
.. math::
\mathrm{B}(\boldsymbol\alpha) = \frac{\prod_{i=1}^K \Gamma(\alpha_i)}{\Gamma\bigl(\sum_{i=1}^K \alpha_i\bigr)}
and :math:`\boldsymbol\alpha=(\alpha_1,\ldots,\alpha_K)`, the
concentration parameters and :math:`K` is the dimension of the space
where :math:`x` takes values.
"""
def __init__(self):
self.__doc__ = doccer.docformat(self.__doc__, dirichlet_docdict_params)
def __call__(self, alpha):
return dirichlet_frozen(alpha)
def _logpdf(self, x, alpha):
"""
Parameters
----------
x : ndarray
Points at which to evaluate the log of the probability
density function
%(_dirichlet_doc_default_callparams)s
Notes
-----
As this function does no argument checking, it should not be
called directly; use 'logpdf' instead.
"""
lnB = _lnB(alpha)
return - lnB + np.sum((np.log(x.T) * (alpha - 1)).T, 0)
def logpdf(self, x, alpha):
"""
Log of the Dirichlet probability density function.
Parameters
----------
x : array_like
Quantiles, with the last axis of `x` denoting the components.
%(_dirichlet_doc_default_callparams)s
Returns
-------
pdf : ndarray
Log of the probability density function evaluated at `x`
"""
alpha = _dirichlet_check_parameters(alpha)
x = _dirichlet_check_input(alpha, x)
out = self._logpdf(x, alpha)
return _squeeze_output(out)
def pdf(self, x, alpha):
"""
The Dirichlet probability density function.
Parameters
----------
x : array_like
Quantiles, with the last axis of `x` denoting the components.
%(_dirichlet_doc_default_callparams)s
Returns
-------
pdf : ndarray
The probability density function evaluated at `x`
"""
alpha = _dirichlet_check_parameters(alpha)
x = _dirichlet_check_input(alpha, x)
out = np.exp(self._logpdf(x, alpha))
return _squeeze_output(out)
def mean(self, alpha):
"""
Compute the mean of the dirichlet distribution.
Parameters
----------
%(_dirichlet_doc_default_callparams)s
Returns
-------
mu : scalar
Mean of the Dirichlet distribution
"""
alpha = _dirichlet_check_parameters(alpha)
out = alpha / (np.sum(alpha))
return _squeeze_output(out)
def var(self, alpha):
"""
Compute the variance of the dirichlet distribution.
Parameters
----------
%(_dirichlet_doc_default_callparams)s
Returns
-------
v : scalar
Variance of the Dirichlet distribution
"""
alpha = _dirichlet_check_parameters(alpha)
alpha0 = np.sum(alpha)
out = (alpha * (alpha0 - alpha)) / ((alpha0 * alpha0) * (alpha0 + 1))
return out
def entropy(self, alpha):
"""
Compute the differential entropy of the dirichlet distribution.
Parameters
----------
%(_dirichlet_doc_default_callparams)s
Returns
-------
h : scalar
Entropy of the Dirichlet distribution
"""
alpha = _dirichlet_check_parameters(alpha)
alpha0 = np.sum(alpha)
lnB = _lnB(alpha)
K = alpha.shape[0]
out = lnB + (alpha0 - K) * scipy.special.psi(alpha0) - np.sum(
(alpha - 1) * scipy.special.psi(alpha))
return _squeeze_output(out)
def rvs(self, alpha, size=1):
"""
Draw random samples from a Dirichlet distribution.
Parameters
----------
%(_dirichlet_doc_default_callparams)s
size : integer, optional
Number of samples to draw (default 1).
Returns
-------
rvs : ndarray or scalar
Random variates of size (`size`, `N`), where `N` is the
dimension of the random variable.
"""
alpha = _dirichlet_check_parameters(alpha)
return np.random.dirichlet(alpha, size=size)
dirichlet = dirichlet_gen()
class dirichlet_frozen(object):
def __init__(self, alpha):
self.alpha = _dirichlet_check_parameters(alpha)
self._dirichlet = dirichlet_gen()
def logpdf(self, x):
return self._dirichlet.logpdf(x, self.alpha)
def pdf(self, x):
return self._dirichlet.pdf(x, self.alpha)
def mean(self):
return self._dirichlet.mean(self.alpha)
def var(self):
return self._dirichlet.var(self.alpha)
def entropy(self):
return self._dirichlet.entropy(self.alpha)
def rvs(self, size=1):
return self._dirichlet.rvs(self.alpha, size)
# Set frozen generator docstrings from corresponding docstrings in
# multivariate_normal_gen and fill in default strings in class docstrings
for name in ['logpdf', 'pdf', 'rvs', 'mean', 'var', 'entropy']:
method = dirichlet_gen.__dict__[name]
method_frozen = dirichlet_frozen.__dict__[name]
method_frozen.__doc__ = doccer.docformat(
method.__doc__, dirichlet_docdict_noparams)
method.__doc__ = doccer.docformat(method.__doc__, dirichlet_docdict_params)
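The dirichlet docstring above has no usage example; a short sketch via the public scipy.stats.dirichlet, which exposes the same interface:

import numpy as np
from scipy.stats import dirichlet

alpha = np.array([1.0, 2.0, 3.0])
x = np.array([0.2, 0.3, 0.5])               # a point on the simplex
print(dirichlet.pdf(x, alpha))
print(dirichlet.mean(alpha))                # alpha / alpha.sum() = [1/6, 1/3, 1/2]
rv = dirichlet(alpha)                       # frozen form
print(rv.rvs(size=2))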

@ -1,201 +0,0 @@
from __future__ import division, print_function, absolute_import
import numpy as np
from numpy import poly1d
from scipy.special import beta
# The following code was used to generate the Pade coefficients for the
# Tukey Lambda variance function. Version 0.17 of mpmath was used.
#---------------------------------------------------------------------------
# import mpmath as mp
#
# mp.mp.dps = 60
#
# one = mp.mpf(1)
# two = mp.mpf(2)
# three = mp.mpf(3)
#
# def mpvar(lam):
# if lam == 0:
# v = mp.pi**2 / three
# else:
# v = (two / lam**2) * (one / (one + two*lam) -
# mp.beta(lam + one, lam + one))
# return v
#
# t = mp.taylor(mpvar, 0, 8)
# p, q = mp.pade(t, 4, 4)
# print "p =", [mp.fp.mpf(c) for c in p]
# print "q =", [mp.fp.mpf(c) for c in q]
#---------------------------------------------------------------------------
# Pade coefficients for the Tukey Lambda variance function.
_tukeylambda_var_pc = [3.289868133696453, 0.7306125098871127,
-0.5370742306855439, 0.17292046290190008,
-0.02371146284628187]
_tukeylambda_var_qc = [1.0, 3.683605511659861, 4.184152498888124,
1.7660926747377275, 0.2643989311168465]
# numpy.poly1d instances for the numerator and denominator of the
# Pade approximation to the Tukey Lambda variance.
_tukeylambda_var_p = poly1d(_tukeylambda_var_pc[::-1])
_tukeylambda_var_q = poly1d(_tukeylambda_var_qc[::-1])
def tukeylambda_variance(lam):
"""Variance of the Tukey Lambda distribution.
Parameters
----------
lam : array_like
The lambda values at which to compute the variance.
Returns
-------
v : ndarray
The variance. For lam < -0.5, the variance is not defined, so
np.nan is returned. For lam = -0.5, np.inf is returned.
Notes
-----
In an interval around lambda=0, this function uses the [4,4] Pade
approximation to compute the variance. Otherwise it uses the standard
formula (http://en.wikipedia.org/wiki/Tukey_lambda_distribution). The
Pade approximation is used because the standard formula has a removable
discontinuity at lambda = 0, and does not produce accurate numerical
results near lambda = 0.
"""
lam = np.asarray(lam)
shp = lam.shape
lam = np.atleast_1d(lam).astype(np.float64)
# For absolute values of lam less than threshold, use the Pade
# approximation.
threshold = 0.075
# Play games with masks to implement the conditional evaluation of
# the distribution.
# lambda < -0.5: var = nan
low_mask = lam < -0.5
# lambda == -0.5: var = inf
neghalf_mask = lam == -0.5
# abs(lambda) < threshold: use Pade approximation
small_mask = np.abs(lam) < threshold
# else the "regular" case: use the explicit formula.
reg_mask = ~(low_mask | neghalf_mask | small_mask)
# Get the 'lam' values for the cases where they are needed.
small = lam[small_mask]
reg = lam[reg_mask]
# Compute the function for each case.
v = np.empty_like(lam)
v[low_mask] = np.nan
v[neghalf_mask] = np.inf
if small.size > 0:
# Use the Pade approximation near lambda = 0.
v[small_mask] = _tukeylambda_var_p(small) / _tukeylambda_var_q(small)
if reg.size > 0:
v[reg_mask] = (2.0 / reg**2) * (1.0 / (1.0 + 2 * reg) -
beta(reg + 1, reg + 1))
v.shape = shp
return v
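For orientation, the explicit formula used by the regular branch and the lam -> 0 limit handled by the Pade branch; a small self-contained check (lam = 1 is the uniform distribution on [-1, 1], with variance 1/3):

import numpy as np
from scipy.special import beta

def explicit_var(lam):
    # Standard formula; it has a removable singularity at lam = 0,
    # which is why the Pade approximation above is used near zero.
    return (2.0 / lam**2) * (1.0 / (1.0 + 2.0 * lam) - beta(lam + 1, lam + 1))

print(explicit_var(1.0))                    # 1/3, the uniform[-1, 1] variance
print(np.pi**2 / 3.0)                       # the lam -> 0 limit, about 3.2899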
# The following code was used to generate the Pade coefficients for the
# Tukey Lambda kurtosis function. Version 0.17 of mpmath was used.
#---------------------------------------------------------------------------
# import mpmath as mp
#
# mp.mp.dps = 60
#
# one = mp.mpf(1)
# two = mp.mpf(2)
# three = mp.mpf(3)
# four = mp.mpf(4)
#
# def mpkurt(lam):
# if lam == 0:
# k = mp.mpf(6)/5
# else:
# numer = (one/(four*lam+one) - four*mp.beta(three*lam+one, lam+one) +
# three*mp.beta(two*lam+one, two*lam+one))
# denom = two*(one/(two*lam+one) - mp.beta(lam+one,lam+one))**2
# k = numer / denom - three
# return k
#
# # There is a bug in mpmath 0.17: when we use the 'method' keyword of the
# # taylor function and we request a degree 9 Taylor polynomial, we actually
# # get degree 8.
# t = mp.taylor(mpkurt, 0, 9, method='quad', radius=0.01)
# t = [mp.chop(c, tol=1e-15) for c in t]
# p, q = mp.pade(t, 4, 4)
# print "p =", [mp.fp.mpf(c) for c in p]
# print "q =", [mp.fp.mpf(c) for c in q]
#---------------------------------------------------------------------------
# Pade coefficients for the Tukey Lambda kurtosis function.
_tukeylambda_kurt_pc = [1.2, -5.853465139719495, -22.653447381131077,
0.20601184383406815, 4.59796302262789]
_tukeylambda_kurt_qc = [1.0, 7.171149192233599, 12.96663094361842,
0.43075235247853005, -2.789746758009912]
# numpy.poly1d instances for the numerator and denominator of the
# Pade approximation to the Tukey Lambda kurtosis.
_tukeylambda_kurt_p = poly1d(_tukeylambda_kurt_pc[::-1])
_tukeylambda_kurt_q = poly1d(_tukeylambda_kurt_qc[::-1])
def tukeylambda_kurtosis(lam):
"""Kurtosis of the Tukey Lambda distribution.
Parameters
----------
lam : array_like
The lambda values at which to compute the variance.
Returns
-------
k : ndarray
The kurtosis. For lam < -0.25, the kurtosis is not defined, so
np.nan is returned. For lam = -0.25, np.inf is returned.
"""
lam = np.asarray(lam)
shp = lam.shape
lam = np.atleast_1d(lam).astype(np.float64)
# For absolute values of lam less than threshold, use the Pade
# approximation.
threshold = 0.055
# Use masks to implement the conditional evaluation of the kurtosis.
# lambda < -0.25: kurtosis = nan
low_mask = lam < -0.25
# lambda == -0.25: kurtosis = inf
negqrtr_mask = lam == -0.25
# lambda near 0: use Pade approximation
small_mask = np.abs(lam) < threshold
# else the "regular" case: use the explicit formula.
reg_mask = ~(low_mask | negqrtr_mask | small_mask)
# Get the 'lam' values for the cases where they are needed.
small = lam[small_mask]
reg = lam[reg_mask]
# Compute the function for each case.
k = np.empty_like(lam)
k[low_mask] = np.nan
k[negqrtr_mask] = np.inf
if small.size > 0:
k[small_mask] = _tukeylambda_kurt_p(small) / _tukeylambda_kurt_q(small)
if reg.size > 0:
numer = (1.0 / (4 * reg + 1) - 4 * beta(3 * reg + 1, reg + 1) +
3 * beta(2 * reg + 1, 2 * reg + 1))
denom = 2 * (1.0/(2 * reg + 1) - beta(reg + 1, reg + 1))**2
k[reg_mask] = numer / denom - 3
# The return value will be a numpy array; resetting the shape ensures that
# if `lam` was a scalar, the return value is a 0-d array.
k.shape = shp
return k
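# Illustrative sketch (not part of the original file): exercising the special
# cases handled above.  Values below -0.25 give nan, -0.25 gives inf, 0 uses
# the Pade approximation (the logistic limit 6/5), and 1 is the uniform case
# (-6/5).  The `_example_*` names are hypothetical.
_example_lam = np.array([-0.5, -0.25, 0.0, 1.0])
_example_kurt = tukeylambda_kurtosis(_example_lam)
# _example_kurt is approximately [nan, inf, 1.2, -1.2]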

@ -1,271 +0,0 @@
"""Some functions for working with contingency tables (i.e. cross tabulations).
"""
from __future__ import division, print_function, absolute_import
from functools import reduce
import numpy as np
from .stats import power_divergence
__all__ = ['margins', 'expected_freq', 'chi2_contingency']
def margins(a):
"""Return a list of the marginal sums of the array `a`.
Parameters
----------
a : ndarray
The array for which to compute the marginal sums.
Returns
-------
margsums : list of ndarrays
A list of length `a.ndim`. `margsums[k]` is the result
of summing `a` over all axes except `k`; it has the same
number of dimensions as `a`, but the length of each axis
except axis `k` will be 1.
Examples
--------
>>> a = np.arange(12).reshape(2, 6)
>>> a
array([[ 0, 1, 2, 3, 4, 5],
[ 6, 7, 8, 9, 10, 11]])
>>> m0, m1 = margins(a)
>>> m0
array([[15],
[51]])
>>> m1
array([[ 6, 8, 10, 12, 14, 16]])
>>> b = np.arange(24).reshape(2,3,4)
>>> m0, m1, m2 = margins(b)
>>> m0
array([[[ 66]],
[[210]]])
>>> m1
array([[[ 60],
[ 92],
[124]]])
>>> m2
array([[[60, 66, 72, 78]]])
"""
margsums = []
ranged = list(range(a.ndim))
for k in ranged:
marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
margsums.append(marg)
return margsums
def expected_freq(observed):
"""
Compute the expected frequencies from a contingency table.
Given an n-dimensional contingency table of observed frequencies,
compute the expected frequencies for the table based on the marginal
sums under the assumption that the groups associated with each
dimension are independent.
Parameters
----------
observed : array_like
The table of observed frequencies. (While this function can handle
a 1-D array, that case is trivial. Generally `observed` is at
least 2-D.)
Returns
-------
expected : ndarray of float64
The expected frequencies, based on the marginal sums of the table.
Same shape as `observed`.
Examples
--------
>>> observed = np.array([[10, 10, 20],[20, 20, 20]])
>>> expected_freq(observed)
array([[ 12., 12., 16.],
[ 18., 18., 24.]])
"""
# Typically `observed` is an integer array. If `observed` has a large
# number of dimensions or holds large values, some of the following
# computations may overflow, so we first switch to floating point.
observed = np.asarray(observed, dtype=np.float64)
# Create a list of the marginal sums.
margsums = margins(observed)
# Create the array of expected frequencies. The shapes of the
# marginal sums returned by apply_over_axes() are just what we
# need for broadcasting in the following product.
d = observed.ndim
expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
return expected
def chi2_contingency(observed, correction=True, lambda_=None):
"""Chi-square test of independence of variables in a contingency table.
This function computes the chi-square statistic and p-value for the
hypothesis test of independence of the observed frequencies in the
contingency table [1]_ `observed`. The expected frequencies are computed
based on the marginal sums under the assumption of independence; see
`scipy.stats.contingency.expected_freq`. The number of degrees of
freedom is (expressed using numpy functions and attributes)::
dof = observed.size - sum(observed.shape) + observed.ndim - 1
Parameters
----------
observed : array_like
The contingency table. The table contains the observed frequencies
(i.e. number of occurrences) in each category. In the two-dimensional
case, the table is often described as an "R x C table".
correction : bool, optional
If True, *and* the degrees of freedom is 1, apply Yates' correction
for continuity. The effect of the correction is to adjust each
observed value by 0.5 towards the corresponding expected value.
lambda_ : float or str, optional.
By default, the statistic computed in this test is Pearson's
chi-squared statistic [2]_. `lambda_` allows a statistic from the
Cressie-Read power divergence family [3]_ to be used instead. See
`power_divergence` for details.
Returns
-------
chi2 : float
The test statistic.
p : float
The p-value of the test
dof : int
Degrees of freedom
expected : ndarray, same shape as `observed`
The expected frequencies, based on the marginal sums of the table.
See Also
--------
contingency.expected_freq
fisher_exact
chisquare
power_divergence
Notes
-----
An often quoted guideline for the validity of this calculation is that
the test should be used only if the observed and expected frequency in
each cell is at least 5.
This is a test for the independence of different categories of a
population. The test is only meaningful when the dimension of
`observed` is two or more. Applying the test to a one-dimensional
table will always result in `expected` equal to `observed` and a
chi-square statistic equal to 0.
This function does not handle masked arrays, because the calculation
does not make sense with missing values.
Like stats.chisquare, this function computes a chi-square statistic;
the convenience this function provides is to figure out the expected
frequencies and degrees of freedom from the given contingency table.
If these were already known, and if the Yates' correction was not
required, one could use stats.chisquare. That is, if one calls::
chi2, p, dof, ex = chi2_contingency(obs, correction=False)
then the following is true::
(chi2, p) == stats.chisquare(obs.ravel(), f_exp=ex.ravel(),
ddof=obs.size - 1 - dof)
The `lambda_` argument was added in version 0.13.0 of scipy.
References
----------
.. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table
.. [2] "Pearson's chi-squared test",
http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
.. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
pp. 440-464.
Examples
--------
A two-way example (2 x 3):
>>> obs = np.array([[10, 10, 20], [20, 20, 20]])
>>> chi2_contingency(obs)
(2.7777777777777777,
0.24935220877729619,
2,
array([[ 12., 12., 16.],
[ 18., 18., 24.]]))
Perform the test using the log-likelihood ratio (i.e. the "G-test")
instead of Pearson's chi-squared statistic.
>>> g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood")
>>> g, p
(2.7688587616781319, 0.25046668010954165)
A four-way example (2 x 2 x 2 x 2):
>>> obs = np.array(
... [[[[12, 17],
... [11, 16]],
... [[11, 12],
... [15, 16]]],
... [[[23, 15],
... [30, 22]],
... [[14, 17],
... [15, 16]]]])
>>> chi2_contingency(obs)
(8.7584514426741897,
0.64417725029295503,
11,
array([[[[ 14.15462386, 14.15462386],
[ 16.49423111, 16.49423111]],
[[ 11.2461395 , 11.2461395 ],
[ 13.10500554, 13.10500554]]],
[[[ 19.5591166 , 19.5591166 ],
[ 22.79202844, 22.79202844]],
[[ 15.54012004, 15.54012004],
[ 18.10873492, 18.10873492]]]]))
"""
observed = np.asarray(observed)
if np.any(observed < 0):
raise ValueError("All values in `observed` must be nonnegative.")
if observed.size == 0:
raise ValueError("No data; `observed` has size 0.")
expected = expected_freq(observed)
if np.any(expected == 0):
# Include one of the positions where expected is zero in
# the exception message.
zeropos = list(zip(*np.where(expected == 0)))[0]
raise ValueError("The internally computed table of expected "
"frequencies has a zero element at %s." % (zeropos,))
# The degrees of freedom
dof = expected.size - sum(expected.shape) + expected.ndim - 1
if dof == 0:
# Degenerate case; this occurs when `observed` is 1D (or, more
# generally, when it has only one nontrivial dimension). In this
# case, we also have observed == expected, so chi2 is 0.
chi2 = 0.0
p = 1.0
else:
if dof == 1 and correction:
# Adjust `observed` according to Yates' correction for continuity.
observed = observed + 0.5 * np.sign(expected - observed)
chi2, p = power_divergence(observed, expected,
ddof=observed.size - 1 - dof, axis=None,
lambda_=lambda_)
return chi2, p, dof, expected
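# Illustrative sketch (not part of the original file): the identity described
# in the Notes of `chi2_contingency`.  With Yates' correction disabled, the
# statistic and p-value agree with a direct `power_divergence` call on the
# flattened table against the expected frequencies.  The `_ex_*` names are
# hypothetical.
_ex_obs = np.array([[10, 10, 20], [20, 20, 20]])
_ex_chi2, _ex_p, _ex_dof, _ex_exp = chi2_contingency(_ex_obs, correction=False)
_ex_chi2_b, _ex_p_b = power_divergence(_ex_obs.ravel(), _ex_exp.ravel(),
                                       ddof=_ex_obs.size - 1 - _ex_dof)
# (_ex_chi2, _ex_p) == (_ex_chi2_b, _ex_p_b)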

@ -22,4 +22,3 @@ __all__ = ['entropy', 'rv_discrete', 'rv_continuous']
# Add only the distribution names, not the *_gen names.
__all__ += _continuous_distns._distn_names
__all__ += _discrete_distns._distn_names

@ -11,20 +11,19 @@ from __future__ import division, absolute_import
import warnings
from wafo.plotbackend import plotbackend
from wafo.misc import ecross, findcross
import numdifftools # @UnresolvedImport
from wafo.misc import ecross, findcross, argsreduce
from wafo.stats._util import check_random_state
from wafo.stats._constants import _EPS, _XMAX
from wafo.stats._distn_infrastructure import rv_frozen
from scipy._lib.six import string_types
import numdifftools as nd # @UnresolvedImport
from scipy import special
from scipy.linalg import pinv2
from scipy import optimize
import numpy
import numpy as np
from numpy import alltrue, arange, ravel, sum, zeros, log, sqrt, exp
from numpy import (
atleast_1d, any, asarray, nan, pi, # reshape, #repeat, product, ndarray,
isfinite)
from numpy import (alltrue, arange, ravel, zeros, log, sqrt, exp,
atleast_1d, any, asarray, nan, pi, isfinite)
from numpy import flatnonzero as nonzero
@ -48,97 +47,6 @@ def norm_ppf(q):
return special.ndtri(q)
# Frozen RV class
class rv_frozen(object):
''' Frozen continuous or discrete 1D Random Variable object (RV)
Methods
-------
rvs(size=1)
Random variates.
pdf(x)
Probability density function.
cdf(x)
Cumulative distribution function.
sf(x)
Survival function (1-cdf --- sometimes more accurate).
ppf(q)
Percent point function (inverse of cdf --- percentiles).
isf(q)
Inverse survival function (inverse of sf).
stats(moments='mv')
Mean('m'), variance('v'), skew('s'), and/or kurtosis('k').
entropy()
(Differential) entropy of the RV.
'''
def __init__(self, dist, *args, **kwds):
self.dist = dist
args, loc, scale = dist._parse_args(*args, **kwds)
if len(args) == dist.numargs - 2: #
# if isinstance(dist, rv_continuous):
self.par = args + (loc, scale)
else: # rv_discrete
self.par = args + (loc,)
def pdf(self, x):
''' Probability density function at x of the given RV.'''
return self.dist.pdf(x, *self.par)
def cdf(self, x):
'''Cumulative distribution function at x of the given RV.'''
return self.dist.cdf(x, *self.par)
def ppf(self, q):
'''Percent point function (inverse of cdf) at q of the given RV.'''
return self.dist.ppf(q, *self.par)
def isf(self, q):
'''Inverse survival function at q of the given RV.'''
return self.dist.isf(q, *self.par)
def rvs(self, size=None):
'''Random variates of given type.'''
kwds = dict(size=size)
return self.dist.rvs(*self.par, **kwds)
def sf(self, x):
'''Survival function (1-cdf) at x of the given RV.'''
return self.dist.sf(x, *self.par)
def stats(self, moments='mv'):
''' Some statistics of the given RV'''
kwds = dict(moments=moments)
return self.dist.stats(*self.par, **kwds)
def median(self):
return self.dist.median(*self.par)
def mean(self):
return self.dist.mean(*self.par)
def var(self):
return self.dist.var(*self.par)
def std(self):
return self.dist.std(*self.par)
def moment(self, n):
par1 = self.par[:self.dist.numargs]
return self.dist.moment(n, *par1)
def entropy(self):
return self.dist.entropy(*self.par)
def pmf(self, k):
'''Probability mass function at k of the given RV'''
return self.dist.pmf(k, *self.par)
def interval(self, alpha):
return self.dist.interval(alpha, *self.par)
# internal class to profile parameters of a given distribution
class Profile(object):
@ -230,7 +138,7 @@ class Profile(object):
def __init__(self, fit_dist, **kwds):
try:
i0 = (1 - numpy.isfinite(fit_dist.par_fix)).argmax()
i0 = (1 - np.isfinite(fit_dist.par_fix)).argmax()
except:
i0 = 0
self.fit_dist = fit_dist
@ -259,7 +167,7 @@ class Profile(object):
if fit_dist.par_fix is None:
isnotfixed = np.ones(fit_dist.par.shape, dtype=bool)
else:
isnotfixed = 1 - numpy.isfinite(fit_dist.par_fix)
isnotfixed = 1 - np.isfinite(fit_dist.par_fix)
self.i_notfixed = nonzero(isnotfixed)
@ -341,7 +249,7 @@ class Profile(object):
def _set_profile(self, phatfree0, p_opt):
pvec = self._get_pvec(phatfree0, p_opt)
self.data = numpy.ones_like(pvec) * nan
self.data = np.ones_like(pvec) * nan
k1 = (pvec >= p_opt).argmax()
for size, step in ((-1, -1), (pvec.size, 1)):
@ -358,14 +266,14 @@ class Profile(object):
def _prettify_profile(self):
pvec = self.args
ix = nonzero(numpy.isfinite(pvec))
ix = nonzero(np.isfinite(pvec))
self.data = self.data[ix]
self.args = pvec[ix]
cond = self.data == -numpy.inf
cond = self.data == -np.inf
if any(cond):
ind, = cond.nonzero()
self.data.put(ind, floatinfo.min / 2.0)
ind1 = numpy.where(ind == 0, ind, ind - 1)
ind1 = np.where(ind == 0, ind, ind - 1)
cl = self.alpha_cross_level - self.alpha_Lrange / 2.0
t0 = ecross(self.args, self.data, ind1, cl)
self.data.put(ind, cl)
@ -379,29 +287,29 @@ class Profile(object):
phatv = self._par
if self.profile_x:
gradfun = numdifftools.Gradient(self._myinvfun)
gradfun = nd.Gradient(self._myinvfun)
else:
gradfun = numdifftools.Gradient(self._myprbfun)
gradfun = nd.Gradient(self._myprbfun)
drl = gradfun(phatv[self.i_notfixed])
pcov = self.fit_dist.par_cov[i_notfixed, :][:, i_notfixed]
pvar = sum(numpy.dot(drl, pcov) * drl)
pvar = np.sum(np.dot(drl, pcov) * drl)
return pvar
def _get_pvec(self, phatfree0, p_opt):
''' return proper interval for the variable to profile
'''
linspace = numpy.linspace
linspace = np.linspace
if self.pmin is None or self.pmax is None:
pvar = self._get_variance()
if pvar <= 1e-5 or numpy.isnan(pvar):
if pvar <= 1e-5 or np.isnan(pvar):
pvar = max(abs(p_opt) * 0.5, 0.5)
p_crit = (-norm_ppf(self.alpha / 2.0) *
sqrt(numpy.ravel(pvar)) * 1.5)
sqrt(np.ravel(pvar)) * 1.5)
if self.pmin is None:
self.pmin = self._search_pmin(phatfree0,
p_opt - 5.0 * p_crit, p_opt)
@ -412,13 +320,13 @@ class Profile(object):
p_opt + 5.0 * p_crit, p_opt)
p_crit_up = (self.pmax - p_opt) / 5
N4 = numpy.floor(self.N / 4.0)
N4 = np.floor(self.N / 4.0)
pvec1 = linspace(self.pmin, p_opt - p_crit_low, N4 + 1)
pvec2 = linspace(
p_opt - p_crit_low, p_opt + p_crit_up, self.N - 2 * N4)
pvec3 = linspace(p_opt + p_crit_up, self.pmax, N4 + 1)
pvec = numpy.unique(numpy.hstack((pvec1, p_opt, pvec2, pvec3)))
pvec = np.unique(np.hstack((pvec1, p_opt, pvec2, pvec3)))
else:
pvec = linspace(self.pmin, self.pmax, self.N)
@ -701,12 +609,12 @@ class FitDistribution(rv_frozen):
m_variables = ['method', 'alpha', 'par_fix', 'search', 'copydata']
m_defaults = ['ml', 0.05, None, True, True]
for (name, val) in zip(m_variables, m_defaults):
setattr(self, name, kwds.get(name, val))
setattr(self, name, kwds.pop(name, val))
if self.method.lower()[:].startswith('mps'):
self._fitfun = dist.nlogps
self._fitfun = self._nlogps
else:
self._fitfun = dist.nnlf
self._fitfun = self._nnlf
self.data = ravel(data)
if self.copydata:
@ -714,6 +622,7 @@ class FitDistribution(rv_frozen):
self.data.sort()
par, fixedn = self._fit(*args, **kwds)
# super(FitDistribution, self).__init__(dist, *par)
self.par = arr(par)
somefixed = len(fixedn) > 0
if somefixed:
@ -729,13 +638,13 @@ class FitDistribution(rv_frozen):
self._compute_cov()
# Set confidence interval for parameters
pvar = numpy.diag(self.par_cov)
pvar = np.diag(self.par_cov)
zcrit = -norm_ppf(self.alpha / 2.0)
self.par_lower = self.par - zcrit * sqrt(pvar)
self.par_upper = self.par + zcrit * sqrt(pvar)
self.LLmax = -dist.nnlf(self.par, self.data)
self.LPSmax = -dist.nlogps(self.par, self.data)
self.LLmax = -self._nnlf(self.par, self.data)
self.LPSmax = -self._nlogps(self.par, self.data)
self.pvalue = self._pvalue(self.par, self.data, unknown_numpar=numpar)
def __repr__(self):
@ -747,17 +656,30 @@ class FitDistribution(rv_frozen):
return ''.join(t)
def _reduce_func(self, args, kwds):
# First of all, convert fshapes params to fnum: eg for stats.beta,
# shapes='a, b'. To fix `a`, can specify either `f1` or `fa`.
# Convert the latter into the former.
if self.shapes:
shapes = self.shapes.replace(',', ' ').split()
for j, s in enumerate(shapes):
val = kwds.pop('f' + s, None) or kwds.pop('fix_' + s, None)
if val is not None:
key = 'f%d' % j
if key in kwds:
raise ValueError("Duplicate entry for %s." % key)
else:
kwds[key] = val
args = list(args)
Nargs = len(args)
fixedn = []
index = range(Nargs)
names = ['f%d' % n for n in range(Nargs - 2)] + ['floc', 'fscale']
x0 = args[:]
for n, key in zip(index[::-1], names[::-1]):
x0 = []
for n, key in enumerate(names):
if key in kwds:
fixedn.append(n)
args[n] = kwds[key]
del x0[n]
args[n] = kwds.pop(key)
else:
x0.append(args[n])
fitfun = self._fitfun
@ -765,7 +687,7 @@ class FitDistribution(rv_frozen):
func = fitfun
restore = None
else:
if len(fixedn) == len(index):
if len(fixedn) == Nargs:
raise ValueError("All parameters fixed. " +
"There is nothing to optimize.")
@ -786,6 +708,134 @@ class FitDistribution(rv_frozen):
return x0, func, restore, args, fixedn
@staticmethod
def _hessian(nnlf, theta, data, eps=None):
''' approximate hessian of nnlf where theta are the parameters
(including loc and scale)
'''
if eps is None:
eps = (_EPS) ** 0.4
num_par = len(theta)
# pab 07.01.2001: Always choose the stepsize h so that
# it is an exactly representable number.
# This is important when calculating numerical derivatives and is
# accomplished by the following.
delta = (eps + 2.0) - 2.0
delta2 = delta ** 2.0
# Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with
# 1/(d^2 L(theta|x)/dtheta^2)
# using central differences
LL = nnlf(theta, data)
H = zeros((num_par, num_par)) # Hessian matrix
theta = tuple(theta)
for ix in range(num_par):
sparam = list(theta)
sparam[ix] = theta[ix] + delta
fp = nnlf(sparam, data)
sparam[ix] = theta[ix] - delta
fm = nnlf(sparam, data)
H[ix, ix] = (fp - 2 * LL + fm) / delta2
for iy in range(ix + 1, num_par):
sparam[ix] = theta[ix] + delta
sparam[iy] = theta[iy] + delta
fpp = nnlf(sparam, data)
sparam[iy] = theta[iy] - delta
fpm = nnlf(sparam, data)
sparam[ix] = theta[ix] - delta
fmm = nnlf(sparam, data)
sparam[iy] = theta[iy] + delta
fmp = nnlf(sparam, data)
H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2)
H[iy, ix] = H[ix, iy]
sparam[iy] = theta[iy]
return -H
def _nnlf(self, theta, x):
return self.dist._penalized_nnlf(theta, x)
def _nlogps(self, theta, x):
""" Moran's negative log Product Spacings statistic
where theta are the parameters (including loc and scale)
Note the data in x must be sorted
References
-----------
R. C. H. Cheng; N. A. K. Amin (1983)
"Estimating Parameters in Continuous Univariate Distributions with a
Shifted Origin.",
Journal of the Royal Statistical Society. Series B (Methodological),
Vol. 45, No. 3. (1983), pp. 394-403.
R. C. H. Cheng; M. A. Stephens (1989)
"A Goodness-Of-Fit Test Using Moran's Statistic with Estimated
Parameters", Biometrika, 76, 2, pp 385-392
Wong, T.S.T. and Li, W.K. (2006)
"A note on the estimation of extreme value distributions using maximum
product of spacings.",
IMS Lecture Notes Monograph Series 2006, Vol. 52, pp. 272-283
"""
n = 2 if self._rv_continous else 1
try:
loc = theta[-n]
scale = theta[-1]
args = tuple(theta[:-n])
except IndexError:
raise ValueError("Not enough input arguments.")
if not self._rv_continous:
scale = 1
if not self._argcheck(*args) or scale <= 0:
return np.inf
dist = self.dist
x = asarray((x - loc) / scale)
cond0 = (x <= dist.a) | (dist.b <= x)
Nbad = np.sum(cond0)
if Nbad > 0:
x = argsreduce(~cond0, x)[0]
lowertail = True
if lowertail:
prb = np.hstack((0.0, dist.cdf(x, *args), 1.0))
dprb = np.diff(prb)
else:
prb = np.hstack((1.0, dist.sf(x, *args), 0.0))
dprb = -np.diff(prb)
logD = log(dprb)
dx = np.diff(x, axis=0)
tie = (dx == 0)
if any(tie):
# TODO : implement this method for treating ties in data:
# Assume measuring error is delta. Then compute
# yL = F(xi-delta,theta)
# yU = F(xi+delta,theta)
# and replace
# logDj = log((yU-yL)/(r-1)) for j = i+1,i+2,...i+r-1
# The following is OK when only minimization of T is wanted
i_tie, = np.nonzero(tie)
tiedata = x[i_tie]
logD[i_tie + 1] = log(dist._pdf(tiedata, *args)) - log(scale)
finiteD = np.isfinite(logD)
nonfiniteD = 1 - finiteD
Nbad += np.sum(nonfiniteD, axis=0)
if Nbad > 0:
T = -np.sum(logD[finiteD], axis=0) + 100.0 * log(_XMAX) * Nbad
else:
T = -np.sum(logD, axis=0)
return T
def _fit(self, *args, **kwds):
dist = self.dist
@ -799,15 +849,14 @@ class FitDistribution(rv_frozen):
# get distribution specific starting locations
start = dist._fitstart(data)
args += start[Narg:-2]
loc = kwds.get('loc', start[-2])
scale = kwds.get('scale', start[-1])
loc = kwds.pop('loc', start[-2])
scale = kwds.pop('scale', start[-1])
args += (loc, scale)
x0, func, restore, args, fixedn = self._reduce_func(args, kwds)
if self.search:
optimizer = kwds.get('optimizer', optimize.fmin)
optimizer = kwds.pop('optimizer', optimize.fmin)
# convert string to function in scipy.optimize
if (not callable(optimizer) and
isinstance(optimizer, (str, unicode))):
if not callable(optimizer) and isinstance(optimizer, string_types):
if not optimizer.startswith('fmin_'):
optimizer = "fmin_" + optimizer
if optimizer == 'fmin_':
@ -816,7 +865,9 @@ class FitDistribution(rv_frozen):
optimizer = getattr(optimize, optimizer)
except AttributeError:
raise ValueError("%s is not a valid optimizer" % optimizer)
# by now kwds must be empty, since everybody took what they needed
if kwds:
raise TypeError("Unknown arguments: %s." % kwds)
vals = optimizer(func, x0, args=(ravel(data),), disp=0)
vals = tuple(vals)
else:
@ -829,8 +880,7 @@ class FitDistribution(rv_frozen):
'''Compute covariance
'''
somefixed = (self.par_fix is not None) and any(isfinite(self.par_fix))
# H1 = numpy.asmatrix(self.dist.hessian_nnlf(self.par, self.data))
H = numpy.asmatrix(self.dist.hessian_nlogps(self.par, self.data))
H = np.asmatrix(self._hessian(self._fitfun, self.par, self.data))
self.H = H
try:
if somefixed:
@ -1034,7 +1084,7 @@ class FitDistribution(rv_frozen):
# yy[0,0] = 0.0 # pdf
yy[:, 0] = 0.0 # histogram
yy.shape = (-1,)
yy = numpy.hstack((yy, 0.0))
yy = np.hstack((yy, 0.0))
return xx, yy
def _get_empirical_pdf(self):
@ -1110,7 +1160,7 @@ class FitDistribution(rv_frozen):
Note: the data in x must be sorted
'''
dx = numpy.diff(x, axis=0)
dx = np.diff(x, axis=0)
tie = (dx == 0)
if any(tie):
warnings.warn(

@ -1,541 +0,0 @@
#-------------------------------------------------------------------------------
#
# Define classes for (uni/multi)-variate kernel density estimation.
#
# Currently, only Gaussian kernels are implemented.
#
# Written by: Robert Kern
#
# Date: 2004-08-09
#
# Modified: 2005-02-10 by Robert Kern.
# Contributed to Scipy
# 2005-10-07 by Robert Kern.
# Some fixes to match the new scipy_core
#
# Copyright 2004-2005 by Enthought, Inc.
#
#-------------------------------------------------------------------------------
from __future__ import division, print_function, absolute_import
# Standard library imports.
import warnings
# Scipy imports.
from scipy._lib.six import callable, string_types
from scipy import linalg, special
from numpy import atleast_2d, reshape, zeros, newaxis, dot, exp, pi, sqrt, \
ravel, power, atleast_1d, squeeze, sum, transpose
import numpy as np
from numpy.random import randint, multivariate_normal
# Local imports.
from . import mvn
__all__ = ['gaussian_kde']
class gaussian_kde(object):
"""Representation of a kernel-density estimate using Gaussian kernels.
Kernel density estimation is a way to estimate the probability density
function (PDF) of a random variable in a non-parametric way.
`gaussian_kde` works for both uni-variate and multi-variate data. It
includes automatic bandwidth determination. The estimation works best for
a unimodal distribution; bimodal or multi-modal distributions tend to be
oversmoothed.
Parameters
----------
dataset : array_like
Datapoints to estimate from. In case of univariate data this is a 1-D
array, otherwise a 2-D array with shape (# of dims, # of data).
bw_method : str, scalar or callable, optional
The method used to calculate the estimator bandwidth. This can be
'scott', 'silverman', a scalar constant or a callable. If a scalar,
this will be used directly as `kde.factor`. If a callable, it should
take a `gaussian_kde` instance as only parameter and return a scalar.
If None (default), 'scott' is used. See Notes for more details.
Attributes
----------
dataset : ndarray
The dataset with which `gaussian_kde` was initialized.
d : int
Number of dimensions.
n : int
Number of datapoints.
factor : float
The bandwidth factor, obtained from `kde.covariance_factor`, with which
the covariance matrix is multiplied.
covariance : ndarray
The covariance matrix of `dataset`, scaled by the calculated bandwidth
(`kde.factor`).
inv_cov : ndarray
The inverse of `covariance`.
Methods
-------
kde.evaluate(points) : ndarray
Evaluate the estimated pdf on a provided set of points.
kde(points) : ndarray
Same as kde.evaluate(points)
kde.integrate_gaussian(mean, cov) : float
Multiply pdf with a specified Gaussian and integrate over the whole
domain.
kde.integrate_box_1d(low, high) : float
Integrate pdf (1D only) between two bounds.
kde.integrate_box(low_bounds, high_bounds) : float
Integrate pdf over a rectangular space between low_bounds and
high_bounds.
kde.integrate_kde(other_kde) : float
Integrate two kernel density estimates multiplied together.
kde.pdf(points) : ndarray
Alias for ``kde.evaluate(points)``.
kde.logpdf(points) : ndarray
Equivalent to ``np.log(kde.evaluate(points))``.
kde.resample(size=None) : ndarray
Randomly sample a dataset from the estimated pdf.
kde.set_bandwidth(bw_method='scott') : None
Computes the bandwidth, i.e. the coefficient that multiplies the data
covariance matrix to obtain the kernel covariance matrix.
.. versionadded:: 0.11.0
kde.covariance_factor : float
Computes the coefficient (`kde.factor`) that multiplies the data
covariance matrix to obtain the kernel covariance matrix.
The default is `scotts_factor`. A subclass can overwrite this method
to provide a different method, or set it through a call to
`kde.set_bandwidth`.
Notes
-----
Bandwidth selection strongly influences the estimate obtained from the KDE
(much more so than the actual shape of the kernel). Bandwidth selection
can be done by a "rule of thumb", by cross-validation, by "plug-in
methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
uses a rule of thumb, the default is Scott's Rule.
Scott's Rule [1]_, implemented as `scotts_factor`, is::
n**(-1./(d+4)),
with ``n`` the number of data points and ``d`` the number of dimensions.
Silverman's Rule [2]_, implemented as `silverman_factor`, is::
(n * (d + 2) / 4.)**(-1. / (d + 4)).
Good general descriptions of kernel density estimation can be found in [1]_
and [2]_, the mathematics for this multi-dimensional implementation can be
found in [1]_.
References
----------
.. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
Visualization", John Wiley & Sons, New York, Chicester, 1992.
.. [2] B.W. Silverman, "Density Estimation for Statistics and Data
Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
Chapman and Hall, London, 1986.
.. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
.. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
conditional density estimation", Computational Statistics & Data
Analysis, Vol. 36, pp. 279-298, 2001.
Examples
--------
Generate some random two-dimensional data:
>>> from scipy import stats
>>> def measure(n):
...     "Measurement model, return two coupled measurements."
...     m1 = np.random.normal(size=n)
...     m2 = np.random.normal(scale=0.5, size=n)
...     return m1+m2, m1-m2
>>> m1, m2 = measure(2000)
>>> xmin = m1.min()
>>> xmax = m1.max()
>>> ymin = m2.min()
>>> ymax = m2.max()
Perform a kernel density estimate on the data:
>>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
>>> positions = np.vstack([X.ravel(), Y.ravel()])
>>> values = np.vstack([m1, m2])
>>> kernel = stats.gaussian_kde(values)
>>> Z = np.reshape(kernel(positions).T, X.shape)
Plot the results:
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
... extent=[xmin, xmax, ymin, ymax])
>>> ax.plot(m1, m2, 'k.', markersize=2)
>>> ax.set_xlim([xmin, xmax])
>>> ax.set_ylim([ymin, ymax])
>>> plt.show()
"""
def __init__(self, dataset, bw_method=None):
self.dataset = atleast_2d(dataset)
if not self.dataset.size > 1:
raise ValueError("`dataset` input should have multiple elements.")
self.d, self.n = self.dataset.shape
self.set_bandwidth(bw_method=bw_method)
def evaluate(self, points):
"""Evaluate the estimated pdf on a set of points.
Parameters
----------
points : (# of dimensions, # of points)-array
Alternatively, a (# of dimensions,) vector can be passed in and
treated as a single point.
Returns
-------
values : (# of points,)-array
The values at each point.
Raises
------
ValueError : if the dimensionality of the input points is different than
the dimensionality of the KDE.
"""
points = atleast_2d(points)
d, m = points.shape
if d != self.d:
if d == 1 and m == self.d:
# points was passed in as a row vector
points = reshape(points, (self.d, 1))
m = 1
else:
msg = "points have dimension %s, dataset has dimension %s" % (d,
self.d)
raise ValueError(msg)
result = zeros((m,), dtype=np.float)
if m >= self.n:
# there are more points than data, so loop over data
for i in range(self.n):
diff = self.dataset[:, i, newaxis] - points
tdiff = dot(self.inv_cov, diff)
energy = sum(diff*tdiff,axis=0) / 2.0
result = result + exp(-energy)
else:
# loop over points
for i in range(m):
diff = self.dataset - points[:, i, newaxis]
tdiff = dot(self.inv_cov, diff)
energy = sum(diff * tdiff, axis=0) / 2.0
result[i] = sum(exp(-energy), axis=0)
result = result / self._norm_factor
return result
__call__ = evaluate
def integrate_gaussian(self, mean, cov):
"""
Multiply estimated density by a multivariate Gaussian and integrate
over the whole space.
Parameters
----------
mean : array_like
A 1-D array, specifying the mean of the Gaussian.
cov : array_like
A 2-D array, specifying the covariance matrix of the Gaussian.
Returns
-------
result : scalar
The value of the integral.
Raises
------
ValueError :
If the dimensionality of the mean or covariance of the input
Gaussian differs from the KDE's dimensionality.
"""
mean = atleast_1d(squeeze(mean))
cov = atleast_2d(cov)
if mean.shape != (self.d,):
raise ValueError("mean does not have dimension %s" % self.d)
if cov.shape != (self.d, self.d):
raise ValueError("covariance does not have dimension %s" % self.d)
# make mean a column vector
mean = mean[:, newaxis]
sum_cov = self.covariance + cov
diff = self.dataset - mean
tdiff = dot(linalg.inv(sum_cov), diff)
energies = sum(diff * tdiff, axis=0) / 2.0
result = sum(exp(-energies), axis=0) / sqrt(linalg.det(2 * pi *
sum_cov)) / self.n
return result
def integrate_box_1d(self, low, high):
"""
Computes the integral of a 1D pdf between two bounds.
Parameters
----------
low : scalar
Lower bound of integration.
high : scalar
Upper bound of integration.
Returns
-------
value : scalar
The result of the integral.
Raises
------
ValueError
If the KDE is over more than one dimension.
"""
if self.d != 1:
raise ValueError("integrate_box_1d() only handles 1D pdfs")
stdev = ravel(sqrt(self.covariance))[0]
normalized_low = ravel((low - self.dataset) / stdev)
normalized_high = ravel((high - self.dataset) / stdev)
value = np.mean(special.ndtr(normalized_high) -
special.ndtr(normalized_low))
return value
def integrate_box(self, low_bounds, high_bounds, maxpts=None):
"""Computes the integral of a pdf over a rectangular interval.
Parameters
----------
low_bounds : array_like
A 1-D array containing the lower bounds of integration.
high_bounds : array_like
A 1-D array containing the upper bounds of integration.
maxpts : int, optional
The maximum number of points to use for integration.
Returns
-------
value : scalar
The result of the integral.
"""
if maxpts is not None:
extra_kwds = {'maxpts': maxpts}
else:
extra_kwds = {}
value, inform = mvn.mvnun(low_bounds, high_bounds, self.dataset,
self.covariance, **extra_kwds)
if inform:
msg = ('An integral in mvn.mvnun requires more points than %s' %
(self.d * 1000))
warnings.warn(msg)
return value
def integrate_kde(self, other):
"""
Computes the integral of the product of this kernel density estimate
with another.
Parameters
----------
other : gaussian_kde instance
The other kde.
Returns
-------
value : scalar
The result of the integral.
Raises
------
ValueError
If the KDEs have different dimensionality.
"""
if other.d != self.d:
raise ValueError("KDEs are not the same dimensionality")
# we want to iterate over the smallest number of points
if other.n < self.n:
small = other
large = self
else:
small = self
large = other
sum_cov = small.covariance + large.covariance
sum_cov_chol = linalg.cho_factor(sum_cov)
result = 0.0
for i in range(small.n):
mean = small.dataset[:, i, newaxis]
diff = large.dataset - mean
tdiff = linalg.cho_solve(sum_cov_chol, diff)
energies = sum(diff * tdiff, axis=0) / 2.0
result += sum(exp(-energies), axis=0)
result /= sqrt(linalg.det(2 * pi * sum_cov)) * large.n * small.n
return result
def resample(self, size=None):
"""
Randomly sample a dataset from the estimated pdf.
Parameters
----------
size : int, optional
The number of samples to draw. If not provided, then the size is
the same as the underlying dataset.
Returns
-------
resample : (self.d, `size`) ndarray
The sampled dataset.
"""
if size is None:
size = self.n
norm = transpose(multivariate_normal(zeros((self.d,), float),
self.covariance, size=size))
indices = randint(0, self.n, size=size)
means = self.dataset[:, indices]
return means + norm
def scotts_factor(self):
return power(self.n, -1./(self.d+4))
def silverman_factor(self):
return power(self.n*(self.d+2.0)/4.0, -1./(self.d+4))
# Default method to calculate bandwidth, can be overwritten by subclass
covariance_factor = scotts_factor
def set_bandwidth(self, bw_method=None):
"""Compute the estimator bandwidth with given method.
The new bandwidth calculated after a call to `set_bandwidth` is used
for subsequent evaluations of the estimated density.
Parameters
----------
bw_method : str, scalar or callable, optional
The method used to calculate the estimator bandwidth. This can be
'scott', 'silverman', a scalar constant or a callable. If a
scalar, this will be used directly as `kde.factor`. If a callable,
it should take a `gaussian_kde` instance as only parameter and
return a scalar. If None (default), nothing happens; the current
`kde.covariance_factor` method is kept.
Notes
-----
.. versionadded:: 0.11
Examples
--------
>>> x1 = np.array([-7, -5, 1, 4, 5.])
>>> kde = stats.gaussian_kde(x1)
>>> xs = np.linspace(-10, 10, num=50)
>>> y1 = kde(xs)
>>> kde.set_bandwidth(bw_method='silverman')
>>> y2 = kde(xs)
>>> kde.set_bandwidth(bw_method=kde.factor / 3.)
>>> y3 = kde(xs)
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.plot(x1, np.ones(x1.shape) / (4. * x1.size), 'bo',
... label='Data points (rescaled)')
>>> ax.plot(xs, y1, label='Scott (default)')
>>> ax.plot(xs, y2, label='Silverman')
>>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
>>> ax.legend()
>>> plt.show()
"""
if bw_method is None:
pass
elif bw_method == 'scott':
self.covariance_factor = self.scotts_factor
elif bw_method == 'silverman':
self.covariance_factor = self.silverman_factor
elif np.isscalar(bw_method) and not isinstance(bw_method, string_types):
self._bw_method = 'use constant'
self.covariance_factor = lambda: bw_method
elif callable(bw_method):
self._bw_method = bw_method
self.covariance_factor = lambda: self._bw_method(self)
else:
msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
"or a callable."
raise ValueError(msg)
self._compute_covariance()
def _compute_covariance(self):
"""Computes the covariance matrix for each Gaussian kernel using
covariance_factor().
"""
self.factor = self.covariance_factor()
# Cache covariance and inverse covariance of the data
if not hasattr(self, '_data_inv_cov'):
self._data_covariance = atleast_2d(np.cov(self.dataset, rowvar=1,
bias=False))
self._data_inv_cov = linalg.inv(self._data_covariance)
self.covariance = self._data_covariance * self.factor**2
self.inv_cov = self._data_inv_cov / self.factor**2
self._norm_factor = sqrt(linalg.det(2*pi*self.covariance)) * self.n
def pdf(self, x):
"""
Evaluate the estimated pdf on a provided set of points.
Notes
-----
This is an alias for `gaussian_kde.evaluate`. See the ``evaluate``
docstring for more details.
"""
return self.evaluate(x)
def logpdf(self, x):
"""
Evaluate the log of the estimated pdf on a provided set of points.
Notes
-----
See `gaussian_kde.evaluate` for more details; this method simply
returns ``np.log(gaussian_kde.evaluate(x))``.
"""
return np.log(self.evaluate(x))
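# Illustrative sketch (not part of the original file): the two rule-of-thumb
# bandwidth factors described in the Notes of `gaussian_kde`.  For a 1-D
# sample of n points, Scott's rule gives n**(-1/5) and Silverman's rule gives
# (3*n/4)**(-1/5).  The `_demo_*` names are hypothetical.
_demo_data = np.random.normal(size=200)
_demo_kde = gaussian_kde(_demo_data)           # Scott's rule is the default
_demo_scott = _demo_kde.factor                 # 200 ** (-1. / 5)
_demo_kde.set_bandwidth(bw_method='silverman')
_demo_silverman = _demo_kde.factor             # (200 * 3 / 4.) ** (-1. / 5)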

@ -1,15 +0,0 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 06 16:02:47 2011
@author: pab
"""
import numpy as np
import wafo.kdetools as wk
n = 100
x = np.sort(5*np.random.rand(1,n)-2.5, axis=-1).ravel()
y = (np.cos(x) > 2*np.random.rand(n) - 1).ravel()
kreg = wk.KRegression(x,y)
f = kreg(output='plotobj', title='Kernel regression', plotflag=1)
f.plot()

@ -1,13 +0,0 @@
from numpy import asarray, ndarray, ones, nan #, reshape, repeat, product
def valarray(shape, value=nan, typecode=None):
"""Return an array of all value.
"""
#out = reshape(repeat([value],product(shape,axis=0),axis=0),shape)
out = ones(shape, dtype=bool) * value
if typecode is not None:
out = out.astype(typecode)
if not isinstance(out, ndarray):
out = asarray(out)
return out
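# Illustrative sketch (not part of the original file): `valarray` simply builds
# a constant array of a given shape, optionally cast to a dtype.  The
# `_demo_*` names are hypothetical.
_demo_nan = valarray((3,))                             # array of three nans
_demo_zeros = valarray((2, 2), value=0, typecode=int)  # 2 x 2 integer zeros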

File diff suppressed because it is too large

@ -1,79 +0,0 @@
"""
===================================================================
Statistical functions for masked arrays (:mod:`scipy.stats.mstats`)
===================================================================
.. currentmodule:: scipy.stats.mstats
This module contains a large number of statistical functions that can
be used with masked arrays.
Most of these functions are similar to those in scipy.stats but might
have small differences in the API or in the algorithm used. Since this
is a relatively new package, some API changes are still possible.
.. autosummary::
:toctree: generated/
argstoarray
betai
chisquare
count_tied_groups
describe
f_oneway
f_value_wilks_lambda
find_repeats
friedmanchisquare
kendalltau
kendalltau_seasonal
kruskalwallis
ks_twosamp
kurtosis
kurtosistest
linregress
mannwhitneyu
plotting_positions
mode
moment
mquantiles
msign
normaltest
obrientransform
pearsonr
pointbiserialr
rankdata
scoreatpercentile
sem
signaltonoise
skew
skewtest
spearmanr
theilslopes
threshold
tmax
tmean
tmin
trim
trima
trimboth
trimmed_stde
trimr
trimtail
tsem
ttest_onesamp
ttest_ind
ttest_rel
tvar
variation
winsorize
zmap
zscore
"""
from __future__ import division, print_function, absolute_import
from .mstats_basic import *
from .mstats_extras import *
from scipy.stats import gmean, hmean

File diff suppressed because it is too large

@ -1,451 +0,0 @@
"""
Additional statistics functions with support for masked arrays.
"""
# Original author (2007): Pierre GF Gerard-Marchant
from __future__ import division, print_function, absolute_import
__all__ = ['compare_medians_ms',
'hdquantiles', 'hdmedian', 'hdquantiles_sd',
'idealfourths',
'median_cihs','mjci','mquantiles_cimj',
'rsh',
'trimmed_mean_ci',]
import numpy as np
from numpy import float_, int_, ndarray
import numpy.ma as ma
from numpy.ma import MaskedArray
from . import mstats_basic as mstats
from scipy.stats.distributions import norm, beta, t, binom
def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
"""
Computes quantile estimates with the Harrell-Davis method.
The quantile estimates are calculated as a weighted linear combination
of order statistics.
Parameters
----------
data : array_like
Data array.
prob : sequence
Sequence of quantiles to compute.
axis : int
Axis along which to compute the quantiles. If None, use a flattened
array.
var : boolean
Whether to return the variance of the estimate.
Returns
-------
hdquantiles : MaskedArray
A (p,) array of quantiles (if `var` is False), or a (2,p) array of
quantiles and variances (if `var` is True), where ``p`` is the
number of quantiles.
"""
def _hd_1D(data,prob,var):
"Computes the HD quantiles for a 1D array. Returns nan for invalid data."
xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
# Don't use length here, in case we have a numpy scalar
n = xsorted.size
hd = np.empty((2,len(prob)), float_)
if n < 2:
hd.flat = np.nan
if var:
return hd
return hd[0]
v = np.arange(n+1) / float(n)
betacdf = beta.cdf
for (i,p) in enumerate(prob):
_w = betacdf(v, (n+1)*p, (n+1)*(1-p))
w = _w[1:] - _w[:-1]
hd_mean = np.dot(w, xsorted)
hd[0,i] = hd_mean
#
hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
#
hd[0, prob == 0] = xsorted[0]
hd[0, prob == 1] = xsorted[-1]
if var:
hd[1, prob == 0] = hd[1, prob == 1] = np.nan
return hd
return hd[0]
# Initialization & checks
data = ma.array(data, copy=False, dtype=float_)
p = np.array(prob, copy=False, ndmin=1)
# Computes quantiles along axis (or globally)
if (axis is None) or (data.ndim == 1):
result = _hd_1D(data, p, var)
else:
if data.ndim > 2:
raise ValueError("Array 'data' must be at most two dimensional, "
"but got data.ndim = %d" % data.ndim)
result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
return ma.fix_invalid(result, copy=False)
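# Illustrative sketch (not part of the original file): Harrell-Davis quantile
# estimates are weighted combinations of all order statistics, so they are
# smoother than the ordinary sample quantiles.  The `_demo_*` names are
# hypothetical.
_demo_x = ma.array(np.random.normal(size=50))
_demo_hd = hdquantiles(_demo_x, prob=[0.25, 0.5, 0.75])
_demo_hd_var = hdquantiles(_demo_x, prob=[0.25, 0.5, 0.75], var=True)
# _demo_hd has shape (3,); _demo_hd_var has shape (2, 3): estimates on the
# first row, their variances on the second.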
def hdmedian(data, axis=-1, var=False):
"""
Returns the Harrell-Davis estimate of the median along the given axis.
Parameters
----------
data : ndarray
Data array.
axis : int
Axis along which to compute the quantiles. If None, use a flattened
array.
var : boolean
Whether to return the variance of the estimate.
"""
result = hdquantiles(data,[0.5], axis=axis, var=var)
return result.squeeze()
def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
"""
The standard error of the Harrell-Davis quantile estimates by jackknife.
Parameters
----------
data : array_like
Data array.
prob : sequence
Sequence of quantiles to compute.
axis : int
Axis along which to compute the quantiles. If None, use a flattened
array.
Returns
-------
hdquantiles_sd : MaskedArray
Standard error of the Harrell-Davis quantile estimates.
"""
def _hdsd_1D(data,prob):
"Computes the std error for 1D arrays."
xsorted = np.sort(data.compressed())
n = len(xsorted)
#.........
hdsd = np.empty(len(prob), float_)
if n < 2:
hdsd.flat = np.nan
vv = np.arange(n) / float(n-1)
betacdf = beta.cdf
for (i,p) in enumerate(prob):
_w = betacdf(vv, (n+1)*p, (n+1)*(1-p))
w = _w[1:] - _w[:-1]
mx_ = np.fromiter([np.dot(w,xsorted[np.r_[list(range(0,k)),
list(range(k+1,n))].astype(int_)])
for k in range(n)], dtype=float_)
mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1)
hdsd[i] = float(n-1) * np.sqrt(np.diag(mx_var).diagonal() / float(n))
return hdsd
# Initialization & checks
data = ma.array(data, copy=False, dtype=float_)
p = np.array(prob, copy=False, ndmin=1)
# Computes quantiles along axis (or globally)
if (axis is None):
result = _hdsd_1D(data, p)
else:
if data.ndim > 2:
raise ValueError("Array 'data' must be at most two dimensional, "
"but got data.ndim = %d" % data.ndim)
result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
return ma.fix_invalid(result, copy=False).ravel()
def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
alpha=0.05, axis=None):
"""
Selected confidence interval of the trimmed mean along the given axis.
Parameters
----------
data : array_like
Input data.
limits : {None, tuple}, optional
None or a two item tuple.
Tuple of the percentages to cut on each side of the array, with respect
to the number of unmasked data, as floats between 0. and 1. If ``n``
is the number of unmasked data before trimming, then
(``n * limits[0]``)th smallest data and (``n * limits[1]``)th
largest data are masked. The total number of unmasked data after
trimming is ``n * (1. - sum(limits))``.
The value of one limit can be set to None to indicate an open interval.
Defaults to (0.2, 0.2).
inclusive : (2,) tuple of boolean, optional
If relative==False, tuple indicating whether values exactly equal to
the absolute limits are allowed.
If relative==True, tuple indicating whether the number of data being
masked on each side should be rounded (True) or truncated (False).
Defaults to (True, True).
alpha : float, optional
Confidence level of the intervals.
Defaults to 0.05.
axis : int, optional
Axis along which to cut. If None, uses a flattened version of `data`.
Defaults to None.
Returns
-------
trimmed_mean_ci : (2,) ndarray
The lower and upper confidence intervals of the trimmed data.
"""
data = ma.array(data, copy=False)
trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
tmean = trimmed.mean(axis)
tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
df = trimmed.count(axis) - 1
tppf = t.ppf(1-alpha/2.,df)
return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
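# Illustrative sketch (not part of the original file): a 95% confidence
# interval for the 20% trimmed mean of a small sample.  The outlying value is
# trimmed away, so the interval stays near the bulk of the data.  The
# `_demo_*` names are hypothetical.
_demo_sample = ma.array([1., 2., 3., 4., 5., 6., 7., 8., 9., 100.])
_demo_ci_low, _demo_ci_high = trimmed_mean_ci(_demo_sample, limits=(0.2, 0.2),
                                              alpha=0.05)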
def mjci(data, prob=[0.25,0.5,0.75], axis=None):
"""
Returns the Maritz-Jarrett estimators of the standard error of selected
experimental quantiles of the data.
Parameters
----------
data: ndarray
Data array.
prob: sequence
Sequence of quantiles to compute.
axis : int
Axis along which to compute the quantiles. If None, use a flattened
array.
"""
def _mjci_1D(data, p):
data = np.sort(data.compressed())
n = data.size
prob = (np.array(p) * n + 0.5).astype(int_)
betacdf = beta.cdf
mj = np.empty(len(prob), float_)
x = np.arange(1,n+1, dtype=float_) / n
y = x - 1./n
for (i,m) in enumerate(prob):
(m1,m2) = (m-1, n-m)
W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
C1 = np.dot(W,data)
C2 = np.dot(W,data**2)
mj[i] = np.sqrt(C2 - C1**2)
return mj
data = ma.array(data, copy=False)
if data.ndim > 2:
raise ValueError("Array 'data' must be at most two dimensional, "
"but got data.ndim = %d" % data.ndim)
p = np.array(prob, copy=False, ndmin=1)
# Computes quantiles along axis (or globally)
if (axis is None):
return _mjci_1D(data, p)
else:
return ma.apply_along_axis(_mjci_1D, axis, data, p)
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
"""
Computes the alpha confidence interval for the selected quantiles of the
data, with Maritz-Jarrett estimators.
Parameters
----------
data : ndarray
Data array.
prob : sequence
Sequence of quantiles to compute.
alpha : float
Confidence level of the intervals.
axis : integer
Axis along which to compute the quantiles.
If None, use a flattened array.
"""
alpha = min(alpha, 1-alpha)
z = norm.ppf(1-alpha/2.)
xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
smj = mjci(data, prob, axis=axis)
return (xq - z * smj, xq + z * smj)
def median_cihs(data, alpha=0.05, axis=None):
"""
Computes the alpha-level confidence interval for the median of the data.
Uses the Hettmansperger-Sheather method.
Parameters
----------
data : array_like
Input data. Masked values are discarded. The input should be 1D only,
or `axis` should be set to None.
alpha : float
Confidence level of the intervals.
axis : integer
Axis along which to compute the quantiles. If None, use a flattened
array.
Returns
-------
median_cihs :
Alpha level confidence interval.
"""
def _cihs_1D(data, alpha):
data = np.sort(data.compressed())
n = len(data)
alpha = min(alpha, 1-alpha)
k = int(binom._ppf(alpha/2., n, 0.5))
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
if gk < 1-alpha:
k -= 1
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
I = (gk - 1 + alpha)/(gk - gkk)
lambd = (n-k) * I / float(k + (n-2*k)*I)
lims = (lambd*data[k] + (1-lambd)*data[k-1],
lambd*data[n-k-1] + (1-lambd)*data[n-k])
return lims
data = ma.array(data, copy=False)
# Computes quantiles along axis (or globally)
if (axis is None):
result = _cihs_1D(data.compressed(), alpha)
else:
if data.ndim > 2:
raise ValueError("Array 'data' must be at most two dimensional, "
"but got data.ndim = %d" % data.ndim)
result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
return result
def compare_medians_ms(group_1, group_2, axis=None):
"""
Compares the medians from two independent groups along the given axis.
The comparison is performed using the McKean-Schrader estimate of the
standard error of the medians.
Parameters
----------
group_1 : array_like
First dataset.
group_2 : array_like
Second dataset.
axis : int, optional
Axis along which the medians are estimated. If None, the arrays are
flattened. If `axis` is not None, then `group_1` and `group_2`
should have the same shape.
Returns
-------
compare_medians_ms : {float, ndarray}
If `axis` is None, then returns a float, otherwise returns a 1-D
ndarray of floats with a length equal to the length of `group_1`
along `axis`.
"""
(med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
(std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
mstats.stde_median(group_2, axis=axis))
W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
return 1 - norm.cdf(W)
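# Illustrative sketch (not part of the original file): comparing the medians of
# two independent samples; small returned values suggest the medians differ.
# The `_demo_*` names are hypothetical.
_demo_g1 = ma.array(np.random.normal(loc=0.0, size=50))
_demo_g2 = ma.array(np.random.normal(loc=1.0, size=50))
_demo_med_p = compare_medians_ms(_demo_g1, _demo_g2)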
def idealfourths(data, axis=None):
"""
Returns an estimate of the lower and upper quartiles.
Uses the ideal fourths algorithm.
Parameters
----------
data : array_like
Input array.
axis : int, optional
Axis along which the quartiles are estimated. If None, the arrays are
flattened.
Returns
-------
idealfourths : {list of floats, masked array}
Returns the two internal values that divide `data` into four parts
using the ideal fourths algorithm either along the flattened array
(if `axis` is None) or along `axis` of `data`.
"""
def _idf(data):
x = data.compressed()
n = len(x)
if n < 3:
return [np.nan,np.nan]
(j,h) = divmod(n/4. + 5/12.,1)
j = int(j)
qlo = (1-h)*x[j-1] + h*x[j]
k = n - j
qup = (1-h)*x[k] + h*x[k-1]
return [qlo, qup]
data = ma.sort(data, axis=axis).view(MaskedArray)
if (axis is None):
return _idf(data)
else:
return ma.apply_along_axis(_idf, axis, data)
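# Illustrative sketch (not part of the original file): the ideal fourths of an
# evenly spaced sample, which land close to (but not exactly at) the 25% and
# 75% sample quantiles.  The `_demo_*` names are hypothetical.
_demo_y = ma.array(np.arange(100, dtype=float))
_demo_qlo, _demo_qhi = idealfourths(_demo_y)
# _demo_qlo is about 24.4 and _demo_qhi about 74.6 for this sample.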
def rsh(data, points=None):
"""
Evaluates Rosenblatt's shifted histogram estimators for each point
on the dataset 'data'.
Parameters
----------
data : sequence
Input data. Masked values are ignored.
points : sequence
Sequence of points where to evaluate Rosenblatt shifted histogram.
If None, use the data.
"""
data = ma.array(data, copy=False)
if points is None:
points = data
else:
points = np.array(points, copy=False, ndmin=1)
if data.ndim != 1:
raise AttributeError("The input array should be 1D only !")
n = data.count()
r = idealfourths(data, axis=None)
h = 1.2 * (r[-1]-r[0]) / n**(1./5)
nhi = (data[:,None] <= points[None,:] + h).sum(0)
nlo = (data[:,None] < points[None,:] - h).sum(0)
return (nhi-nlo) / (2.*n*h)
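# Illustrative sketch (not part of the original file): Rosenblatt's shifted
# histogram evaluated at the data points themselves and on a regular grid.
# The `_demo_*` names are hypothetical.
_demo_d = ma.array(np.random.normal(size=200))
_demo_density_at_data = rsh(_demo_d)
_demo_density_on_grid = rsh(_demo_d, points=np.linspace(-3, 3, 61))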

@ -1,76 +0,0 @@
from __future__ import division, print_function, absolute_import
from numpy import vectorize, deprecate
from numpy.random import random_sample
__all__ = ['randwppf', 'randwcdf']
# XXX: Are these needed anymore?
#####################################
# General purpose continuous
######################################
@deprecate(message="Deprecated in scipy 0.14.0, use "
"distribution-specific rvs() method instead")
def randwppf(ppf, args=(), size=None):
"""
returns an array of randomly distributed numbers from a distribution
whose percent point function (inverse of the CDF or quantile function)
is given.
args is a tuple of extra arguments to the ppf function (i.e. shape,
location, scale), and size is the size of the output. Note the ppf
function must accept an array of q values to compute over.
"""
U = random_sample(size=size)
return ppf(*(U,)+args)
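# Illustrative sketch (not part of the original file): `randwppf` is plain
# inverse-transform sampling -- draw U ~ Uniform(0, 1) and push it through the
# percent point function.  Any vectorized ppf works; the exponential quantile
# function is written out by hand here, and the `_demo_*` names are
# hypothetical.
import numpy as np

def _demo_expon_ppf(q, scale=1.0):
    # quantile function (inverse CDF) of the exponential distribution
    return -scale * np.log(1.0 - q)

_demo_expon_sample = randwppf(_demo_expon_ppf, args=(2.0,), size=1000)
# 1000 draws that behave like exponential variates with scale 2.0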
@deprecate(message="Deprecated in scipy 0.14.0, use "
"distribution-specific rvs() method instead")
def randwcdf(cdf, mean=1.0, args=(), size=None):
"""
Returns an array of randomly distributed numbers given a CDF.
Given a cumulative distribution function (CDF), returns an array of
randomly distributed numbers that would satisfy the CDF.
Parameters
----------
cdf : function
CDF function that accepts a single value and `args`, and returns
a single value.
mean : float, optional
The mean of the distribution which helps the solver. Defaults
to 1.0.
args : tuple, optional
Extra arguments to the cdf function (i.e. shape, location, scale)
size : {int, None}, optional
Is the size of the output. If None, only 1 value will be returned.
Returns
-------
randwcdf : ndarray
Array of random numbers.
Notes
-----
Can use the ``scipy.stats.distributions.*.cdf`` functions for the
`cdf` parameter.
"""
import scipy.optimize as optimize
def _ppfopt(x, q, *nargs):
newargs = (x,)+nargs
return cdf(*newargs) - q
def _ppf(q, *nargs):
return optimize.fsolve(_ppfopt, mean, args=(q,)+nargs)
_vppf = vectorize(_ppf)
U = random_sample(size=size)
return _vppf(*(U,)+args)

@ -1,389 +0,0 @@
"""Utilities for writing code that runs on Python 2 and 3"""
# Copyright (c) 2010-2012 Benjamin Peterson
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import operator
import sys
import types
__author__ = "Benjamin Peterson <benjamin@python.org>"
__version__ = "1.2.0"
# True if we are running on Python 3.
PY3 = sys.version_info[0] == 3
if PY3:
string_types = str,
integer_types = int,
class_types = type,
text_type = str
binary_type = bytes
MAXSIZE = sys.maxsize
else:
string_types = basestring,
integer_types = (int, long)
class_types = (type, types.ClassType)
text_type = unicode
binary_type = str
if sys.platform.startswith("java"):
# Jython always uses 32 bits.
MAXSIZE = int((1 << 31) - 1)
else:
# It's possible to have sizeof(long) != sizeof(Py_ssize_t).
class X(object):
def __len__(self):
return 1 << 31
try:
len(X())
except OverflowError:
# 32-bit
MAXSIZE = int((1 << 31) - 1)
else:
# 64-bit
MAXSIZE = int((1 << 63) - 1)
del X
def _add_doc(func, doc):
"""Add documentation to a function."""
func.__doc__ = doc
def _import_module(name):
"""Import module, returning the module after the last dot."""
__import__(name)
return sys.modules[name]
class _LazyDescr(object):
def __init__(self, name):
self.name = name
def __get__(self, obj, tp):
result = self._resolve()
setattr(obj, self.name, result)
# This is a bit ugly, but it avoids running this again.
delattr(tp, self.name)
return result
class MovedModule(_LazyDescr):
def __init__(self, name, old, new=None):
super(MovedModule, self).__init__(name)
if PY3:
if new is None:
new = name
self.mod = new
else:
self.mod = old
def _resolve(self):
return _import_module(self.mod)
class MovedAttribute(_LazyDescr):
def __init__(self, name, old_mod, new_mod, old_attr=None, new_attr=None):
super(MovedAttribute, self).__init__(name)
if PY3:
if new_mod is None:
new_mod = name
self.mod = new_mod
if new_attr is None:
if old_attr is None:
new_attr = name
else:
new_attr = old_attr
self.attr = new_attr
else:
self.mod = old_mod
if old_attr is None:
old_attr = name
self.attr = old_attr
def _resolve(self):
module = _import_module(self.mod)
return getattr(module, self.attr)
class _MovedItems(types.ModuleType):
"""Lazy loading of moved objects"""
_moved_attributes = [
MovedAttribute("cStringIO", "cStringIO", "io", "StringIO"),
MovedAttribute("filter", "itertools", "builtins", "ifilter", "filter"),
MovedAttribute("input", "__builtin__", "builtins", "raw_input", "input"),
MovedAttribute("map", "itertools", "builtins", "imap", "map"),
MovedAttribute("reload_module", "__builtin__", "imp", "reload"),
MovedAttribute("reduce", "__builtin__", "functools"),
MovedAttribute("StringIO", "StringIO", "io"),
MovedAttribute("xrange", "__builtin__", "builtins", "xrange", "range"),
MovedAttribute("zip", "itertools", "builtins", "izip", "zip"),
MovedModule("builtins", "__builtin__"),
MovedModule("configparser", "ConfigParser"),
MovedModule("copyreg", "copy_reg"),
MovedModule("http_cookiejar", "cookielib", "http.cookiejar"),
MovedModule("http_cookies", "Cookie", "http.cookies"),
MovedModule("html_entities", "htmlentitydefs", "html.entities"),
MovedModule("html_parser", "HTMLParser", "html.parser"),
MovedModule("http_client", "httplib", "http.client"),
MovedModule("email_mime_multipart", "email.MIMEMultipart", "email.mime.multipart"),
MovedModule("email_mime_text", "email.MIMEText", "email.mime.text"),
MovedModule("email_mime_base", "email.MIMEBase", "email.mime.base"),
MovedModule("BaseHTTPServer", "BaseHTTPServer", "http.server"),
MovedModule("CGIHTTPServer", "CGIHTTPServer", "http.server"),
MovedModule("SimpleHTTPServer", "SimpleHTTPServer", "http.server"),
MovedModule("cPickle", "cPickle", "pickle"),
MovedModule("queue", "Queue"),
MovedModule("reprlib", "repr"),
MovedModule("socketserver", "SocketServer"),
MovedModule("tkinter", "Tkinter"),
MovedModule("tkinter_dialog", "Dialog", "tkinter.dialog"),
MovedModule("tkinter_filedialog", "FileDialog", "tkinter.filedialog"),
MovedModule("tkinter_scrolledtext", "ScrolledText", "tkinter.scrolledtext"),
MovedModule("tkinter_simpledialog", "SimpleDialog", "tkinter.simpledialog"),
MovedModule("tkinter_tix", "Tix", "tkinter.tix"),
MovedModule("tkinter_constants", "Tkconstants", "tkinter.constants"),
MovedModule("tkinter_dnd", "Tkdnd", "tkinter.dnd"),
MovedModule("tkinter_colorchooser", "tkColorChooser",
"tkinter.colorchooser"),
MovedModule("tkinter_commondialog", "tkCommonDialog",
"tkinter.commondialog"),
MovedModule("tkinter_tkfiledialog", "tkFileDialog", "tkinter.filedialog"),
MovedModule("tkinter_font", "tkFont", "tkinter.font"),
MovedModule("tkinter_messagebox", "tkMessageBox", "tkinter.messagebox"),
MovedModule("tkinter_tksimpledialog", "tkSimpleDialog",
"tkinter.simpledialog"),
MovedModule("urllib_robotparser", "robotparser", "urllib.robotparser"),
MovedModule("winreg", "_winreg"),
]
for attr in _moved_attributes:
setattr(_MovedItems, attr.name, attr)
del attr
moves = sys.modules[__name__ + ".moves"] = _MovedItems("moves")
def add_move(move):
"""Add an item to six.moves."""
setattr(_MovedItems, move.name, move)
def remove_move(name):
"""Remove item from six.moves."""
try:
delattr(_MovedItems, name)
except AttributeError:
try:
del moves.__dict__[name]
except KeyError:
raise AttributeError("no such move, %r" % (name,))
if PY3:
_meth_func = "__func__"
_meth_self = "__self__"
_func_code = "__code__"
_func_defaults = "__defaults__"
_iterkeys = "keys"
_itervalues = "values"
_iteritems = "items"
else:
_meth_func = "im_func"
_meth_self = "im_self"
_func_code = "func_code"
_func_defaults = "func_defaults"
_iterkeys = "iterkeys"
_itervalues = "itervalues"
_iteritems = "iteritems"
try:
advance_iterator = next
except NameError:
def advance_iterator(it):
return it.next()
next = advance_iterator
if PY3:
def get_unbound_function(unbound):
return unbound
Iterator = object
def callable(obj):
return any("__call__" in klass.__dict__ for klass in type(obj).__mro__)
else:
def get_unbound_function(unbound):
return unbound.im_func
class Iterator(object):
def next(self):
return type(self).__next__(self)
callable = callable
_add_doc(get_unbound_function,
"""Get the function out of a possibly unbound function""")
get_method_function = operator.attrgetter(_meth_func)
get_method_self = operator.attrgetter(_meth_self)
get_function_code = operator.attrgetter(_func_code)
get_function_defaults = operator.attrgetter(_func_defaults)
def iterkeys(d):
"""Return an iterator over the keys of a dictionary."""
return iter(getattr(d, _iterkeys)())
def itervalues(d):
"""Return an iterator over the values of a dictionary."""
return iter(getattr(d, _itervalues)())
def iteritems(d):
"""Return an iterator over the (key, value) pairs of a dictionary."""
return iter(getattr(d, _iteritems)())
if PY3:
def b(s):
return s.encode("latin-1")
def u(s):
return s
if sys.version_info[1] <= 1:
def int2byte(i):
return bytes((i,))
else:
# This is about 2x faster than the implementation above on 3.2+
int2byte = operator.methodcaller("to_bytes", 1, "big")
import io
StringIO = io.StringIO
BytesIO = io.BytesIO
else:
def b(s):
return s
def u(s):
return unicode(s, "unicode_escape")
int2byte = chr
import StringIO
StringIO = BytesIO = StringIO.StringIO
_add_doc(b, """Byte literal""")
_add_doc(u, """Text literal""")
if PY3:
import builtins # @UnresolvedImport
exec_ = getattr(builtins, "exec")
def reraise(tp, value, tb=None):
if value.__traceback__ is not tb:
raise value.with_traceback(tb)
raise value
print_ = getattr(builtins, "print")
del builtins
else:
def exec_(code, globs=None, locs=None):
"""Execute code in a namespace."""
if globs is None:
frame = sys._getframe(1)
globs = frame.f_globals
if locs is None:
locs = frame.f_locals
del frame
elif locs is None:
locs = globs
exec("""exec code in globs, locs""")
exec_("""def reraise(tp, value, tb=None):
raise tp, value, tb
""")
def print_(*args, **kwargs):
"""The new-style print function."""
fp = kwargs.pop("file", sys.stdout)
if fp is None:
return
def write(data):
if not isinstance(data, basestring):
data = str(data)
fp.write(data)
want_unicode = False
sep = kwargs.pop("sep", None)
if sep is not None:
if isinstance(sep, unicode):
want_unicode = True
elif not isinstance(sep, str):
raise TypeError("sep must be None or a string")
end = kwargs.pop("end", None)
if end is not None:
if isinstance(end, unicode):
want_unicode = True
elif not isinstance(end, str):
raise TypeError("end must be None or a string")
if kwargs:
raise TypeError("invalid keyword arguments to print()")
if not want_unicode:
for arg in args:
if isinstance(arg, unicode):
want_unicode = True
break
if want_unicode:
newline = unicode("\n")
space = unicode(" ")
else:
newline = "\n"
space = " "
if sep is None:
sep = space
if end is None:
end = newline
for i, arg in enumerate(args):
if i:
write(sep)
write(arg)
write(end)
_add_doc(reraise, """Reraise an exception.""")
def with_metaclass(meta, base=object):
"""Create a base class with a metaclass."""
return meta("NewBase", (base,), {})

File diff suppressed because it is too large

@ -1,16 +1,23 @@
from __future__ import division, print_function, absolute_import
import inspect
import warnings
import pickle
import numpy as np
import numpy.testing as npt
from numpy.testing import assert_allclose
import numpy.ma.testutils as ma_npt
from scipy._lib._version import NumpyVersion
from wafo.stats._util import getargspec_no_self as _getargspec
from wafo import stats
NUMPY_BELOW_1_7 = NumpyVersion(np.__version__) < '1.7.0'
def check_named_results(res, attributes, ma=False):
for i, attr in enumerate(attributes):
if ma:
ma_npt.assert_equal(res[i], getattr(res, attr))
else:
npt.assert_equal(res[i], getattr(res, attr))
def check_normalization(distfn, args, distname):
@ -94,17 +101,19 @@ def check_private_entropy(distfn, args, superclass):
def check_edge_support(distfn, args):
# Make sure the x=self.a and self.b are handled correctly.
# Make sure that x=self.a and self.b are handled correctly.
x = [distfn.a, distfn.b]
if isinstance(distfn, stats.rv_continuous):
npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0])
npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0])
if isinstance(distfn, stats.rv_discrete):
x = [distfn.a - 1, distfn.b]
npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0])
npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0])
npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0])
if distfn.name not in ('skellam', 'dlaplace'):
# with a = -inf, log(0) generates warnings
npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0])
npt.assert_equal(distfn.logsf(x, *args), [0.0, -np.inf])
if isinstance(distfn, stats.rv_discrete):
x = [distfn.a-1, distfn.b]
npt.assert_equal(distfn.ppf([0.0, 1.0], *args), x)
npt.assert_equal(distfn.isf([0.0, 1.0], *args), x[::-1])
@ -117,12 +126,12 @@ def check_named_args(distfn, x, shape_args, defaults, meths):
## Check calling w/ named arguments.
# check consistency of shapes, numargs and _parse signature
signature = inspect.getargspec(distfn._parse_args)
signature = _getargspec(distfn._parse_args)
npt.assert_(signature.varargs is None)
npt.assert_(signature.keywords is None)
npt.assert_(signature.defaults == defaults)
npt.assert_(list(signature.defaults) == list(defaults))
shape_argnames = signature.args[1:-len(defaults)] # self, a, b, loc=0, scale=1
shape_argnames = signature.args[:-len(defaults)] # a, b, loc=0, scale=1
if distfn.shapes:
shapes_ = distfn.shapes.replace(',', ' ').split()
else:
@ -152,3 +161,115 @@ def check_named_args(distfn, x, shape_args, defaults, meths):
k.update({'kaboom': 42})
npt.assert_raises(TypeError, distfn.cdf, x, **k)
def check_random_state_property(distfn, args):
# check the random_state attribute of a distribution *instance*
# This test fiddles with distfn.random_state. This breaks other tests,
# hence we need to save it here and restore it afterwards.
rndm = distfn.random_state
# baseline: this relies on the global state
np.random.seed(1234)
distfn.random_state = None
r0 = distfn.rvs(*args, size=8)
# use an explicit instance-level random_state
distfn.random_state = 1234
r1 = distfn.rvs(*args, size=8)
npt.assert_equal(r0, r1)
distfn.random_state = np.random.RandomState(1234)
r2 = distfn.rvs(*args, size=8)
npt.assert_equal(r0, r2)
# can override the instance-level random_state for an individual .rvs call
distfn.random_state = 2
orig_state = distfn.random_state.get_state()
r3 = distfn.rvs(*args, size=8, random_state=np.random.RandomState(1234))
npt.assert_equal(r0, r3)
# ... and that does not alter the instance-level random_state!
npt.assert_equal(distfn.random_state.get_state(), orig_state)
# finally, restore the random_state
distfn.random_state = rndm
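# Stand-alone sketch of the semantics exercised above (assumes scipy >= 0.16
# behaviour; ``norm`` is only a convenient example distribution):
#   random_state = None      -> fall back to the global numpy random state
#   random_state = 1234      -> private RandomState seeded with 1234
#   rvs(..., random_state=r) -> one-off override, instance state untouched
def _random_state_demo():
    saved = stats.norm.random_state
    np.random.seed(1234)
    stats.norm.random_state = None
    baseline = stats.norm.rvs(size=3)
    stats.norm.random_state = 1234
    seeded = stats.norm.rvs(size=3)
    one_off = stats.norm.rvs(size=3, random_state=np.random.RandomState(1234))
    stats.norm.random_state = saved
    return baseline, seeded, one_off   # all three arrays are identical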
def check_meth_dtype(distfn, arg, meths):
q0 = [0.25, 0.5, 0.75]
x0 = distfn.ppf(q0, *arg)
x_cast = [x0.astype(tp) for tp in
(np.int_, np.float16, np.float32, np.float64)]
for x in x_cast:
# casting may have clipped the values, exclude those
distfn._argcheck(*arg)
x = x[(distfn.a < x) & (x < distfn.b)]
for meth in meths:
val = meth(x, *arg)
npt.assert_(val.dtype == np.float_)
def check_ppf_dtype(distfn, arg):
q0 = np.asarray([0.25, 0.5, 0.75])
q_cast = [q0.astype(tp) for tp in (np.float16, np.float32, np.float64)]
for q in q_cast:
for meth in [distfn.ppf, distfn.isf]:
val = meth(q, *arg)
npt.assert_(val.dtype == np.float_)
def check_cmplx_deriv(distfn, arg):
# Distributions allow complex arguments.
def deriv(f, x, *arg):
x = np.asarray(x)
h = 1e-10
return (f(x + h*1j, *arg)/h).imag
x0 = distfn.ppf([0.25, 0.51, 0.75], *arg)
x_cast = [x0.astype(tp) for tp in
(np.int_, np.float16, np.float32, np.float64)]
for x in x_cast:
# casting may have clipped the values, exclude those
distfn._argcheck(*arg)
x = x[(distfn.a < x) & (x < distfn.b)]
pdf, cdf, sf = distfn.pdf(x, *arg), distfn.cdf(x, *arg), distfn.sf(x, *arg)
assert_allclose(deriv(distfn.cdf, x, *arg), pdf, rtol=1e-5)
assert_allclose(deriv(distfn.logcdf, x, *arg), pdf/cdf, rtol=1e-5)
assert_allclose(deriv(distfn.sf, x, *arg), -pdf, rtol=1e-5)
assert_allclose(deriv(distfn.logsf, x, *arg), -pdf/sf, rtol=1e-5)
assert_allclose(deriv(distfn.logpdf, x, *arg),
deriv(distfn.pdf, x, *arg) / distfn.pdf(x, *arg),
rtol=1e-5)
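# The helper above relies on complex-step differentiation: for an analytic f,
# f(x + ih) = f(x) + ih*f'(x) + O(h**2), so imag(f(x + ih))/h approximates
# f'(x) without subtractive cancellation.  Minimal sketch on a plain function
# (a standalone illustration, not part of the original suite):
def _complex_step_demo(x=0.7, h=1e-10):
    def f(t):
        return np.exp(t) * np.sin(t)
    def fprime(t):
        return np.exp(t) * (np.sin(t) + np.cos(t))
    approx = (f(x + 1j * h) / h).imag
    return approx, fprime(x)   # agree to roughly machine precision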
def check_pickling(distfn, args):
# check that a distribution instance pickles and unpickles
# pay special attention to the random_state property
# save the random_state (restore later)
rndm = distfn.random_state
distfn.random_state = 1234
distfn.rvs(*args, size=8)
s = pickle.dumps(distfn)
r0 = distfn.rvs(*args, size=8)
unpickled = pickle.loads(s)
r1 = unpickled.rvs(*args, size=8)
npt.assert_equal(r0, r1)
# also smoke test some methods
medians = [distfn.ppf(0.5, *args), unpickled.ppf(0.5, *args)]
npt.assert_equal(medians[0], medians[1])
npt.assert_equal(distfn.cdf(medians[0], *args),
unpickled.cdf(medians[1], *args))
# restore the random_state
distfn.random_state = rndm

@ -1,238 +0,0 @@
from __future__ import division, print_function, absolute_import
import numpy as np
from numpy.testing import assert_array_almost_equal, run_module_suite
from scipy.stats import \
binned_statistic, binned_statistic_2d, binned_statistic_dd
class TestBinnedStatistic(object):
@classmethod
def setup_class(cls):
np.random.seed(9865)
cls.x = np.random.random(100)
cls.y = np.random.random(100)
cls.v = np.random.random(100)
cls.X = np.random.random((100, 3))
def test_1d_count(self):
x = self.x
v = self.v
count1, edges1, bc = binned_statistic(x, v, 'count', bins=10)
count2, edges2 = np.histogram(x, bins=10)
assert_array_almost_equal(count1, count2)
assert_array_almost_equal(edges1, edges2)
def test_1d_sum(self):
x = self.x
v = self.v
sum1, edges1, bc = binned_statistic(x, v, 'sum', bins=10)
sum2, edges2 = np.histogram(x, bins=10, weights=v)
assert_array_almost_equal(sum1, sum2)
assert_array_almost_equal(edges1, edges2)
def test_1d_mean(self):
x = self.x
v = self.v
stat1, edges1, bc = binned_statistic(x, v, 'mean', bins=10)
stat2, edges2, bc = binned_statistic(x, v, np.mean, bins=10)
assert_array_almost_equal(stat1, stat2)
assert_array_almost_equal(edges1, edges2)
def test_1d_std(self):
x = self.x
v = self.v
stat1, edges1, bc = binned_statistic(x, v, 'std', bins=10)
stat2, edges2, bc = binned_statistic(x, v, np.std, bins=10)
assert_array_almost_equal(stat1, stat2)
assert_array_almost_equal(edges1, edges2)
def test_1d_median(self):
x = self.x
v = self.v
stat1, edges1, bc = binned_statistic(x, v, 'median', bins=10)
stat2, edges2, bc = binned_statistic(x, v, np.median, bins=10)
assert_array_almost_equal(stat1, stat2)
assert_array_almost_equal(edges1, edges2)
def test_1d_bincode(self):
x = self.x[:20]
v = self.v[:20]
count1, edges1, bc = binned_statistic(x, v, 'count', bins=3)
bc2 = np.array([3, 2, 1, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 1, 2, 3, 1,
1, 2, 1])
bcount = [(bc == i).sum() for i in np.unique(bc)]
assert_array_almost_equal(bc, bc2)
assert_array_almost_equal(bcount, count1)
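# The third return value ('bc' above) gives the 1-based bin index of every
# sample, which makes it easy to pull out the samples falling in one bin.
# A short illustrative sketch (not part of the deleted test file):
def _binnumber_demo():
    x = np.array([0.1, 0.1, 0.4, 0.8])
    stat, edges, binnumber = binned_statistic(x, x, 'mean', bins=2)
    in_first_bin = x[binnumber == 1]   # -> [0.1, 0.1, 0.4]
    return stat, in_first_bin          # stat == [0.2, 0.8]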
def test_1d_range_keyword(self):
# Regression test for gh-3063, range can be (min, max) or [(min, max)]
np.random.seed(9865)
x = np.arange(30)
data = np.random.random(30)
mean, bins, _ = binned_statistic(x[:15], data[:15])
mean_range, bins_range, _ = binned_statistic(x, data, range=[(0, 14)])
mean_range2, bins_range2, _ = binned_statistic(x, data, range=(0, 14))
assert_array_almost_equal(mean, mean_range)
assert_array_almost_equal(bins, bins_range)
assert_array_almost_equal(mean, mean_range2)
assert_array_almost_equal(bins, bins_range2)
def test_2d_count(self):
x = self.x
y = self.y
v = self.v
count1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'count', bins=5)
count2, binx2, biny2 = np.histogram2d(x, y, bins=5)
assert_array_almost_equal(count1, count2)
assert_array_almost_equal(binx1, binx2)
assert_array_almost_equal(biny1, biny2)
def test_2d_sum(self):
x = self.x
y = self.y
v = self.v
sum1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'sum', bins=5)
sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v)
assert_array_almost_equal(sum1, sum2)
assert_array_almost_equal(binx1, binx2)
assert_array_almost_equal(biny1, biny2)
def test_2d_mean(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'mean', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
assert_array_almost_equal(stat1, stat2)
assert_array_almost_equal(binx1, binx2)
assert_array_almost_equal(biny1, biny2)
def test_2d_std(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'std', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.std, bins=5)
assert_array_almost_equal(stat1, stat2)
assert_array_almost_equal(binx1, binx2)
assert_array_almost_equal(biny1, biny2)
def test_2d_median(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'median', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.median, bins=5)
assert_array_almost_equal(stat1, stat2)
assert_array_almost_equal(binx1, binx2)
assert_array_almost_equal(biny1, biny2)
def test_2d_bincode(self):
x = self.x[:20]
y = self.y[:20]
v = self.v[:20]
count1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'count', bins=3)
bc2 = np.array([17, 11, 6, 16, 11, 17, 18, 17, 17, 7, 6, 18, 16,
6, 11, 16, 6, 6, 11, 8])
bcount = [(bc == i).sum() for i in np.unique(bc)]
assert_array_almost_equal(bc, bc2)
count1adj = count1[count1.nonzero()]
assert_array_almost_equal(bcount, count1adj)
def test_dd_count(self):
X = self.X
v = self.v
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
count2, edges2 = np.histogramdd(X, bins=3)
assert_array_almost_equal(count1, count2)
assert_array_almost_equal(edges1, edges2)
def test_dd_sum(self):
X = self.X
v = self.v
sum1, edges1, bc = binned_statistic_dd(X, v, 'sum', bins=3)
sum2, edges2 = np.histogramdd(X, bins=3, weights=v)
assert_array_almost_equal(sum1, sum2)
assert_array_almost_equal(edges1, edges2)
def test_dd_mean(self):
X = self.X
v = self.v
stat1, edges1, bc = binned_statistic_dd(X, v, 'mean', bins=3)
stat2, edges2, bc = binned_statistic_dd(X, v, np.mean, bins=3)
assert_array_almost_equal(stat1, stat2)
assert_array_almost_equal(edges1, edges2)
def test_dd_std(self):
X = self.X
v = self.v
stat1, edges1, bc = binned_statistic_dd(X, v, 'std', bins=3)
stat2, edges2, bc = binned_statistic_dd(X, v, np.std, bins=3)
assert_array_almost_equal(stat1, stat2)
assert_array_almost_equal(edges1, edges2)
def test_dd_median(self):
X = self.X
v = self.v
stat1, edges1, bc = binned_statistic_dd(X, v, 'median', bins=3)
stat2, edges2, bc = binned_statistic_dd(X, v, np.median, bins=3)
assert_array_almost_equal(stat1, stat2)
assert_array_almost_equal(edges1, edges2)
def test_dd_bincode(self):
X = self.X[:20]
v = self.v[:20]
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
bc2 = np.array([63, 33, 86, 83, 88, 67, 57, 33, 42, 41, 82, 83, 92,
32, 36, 91, 43, 87, 81, 81])
bcount = [(bc == i).sum() for i in np.unique(bc)]
assert_array_almost_equal(bc, bc2)
count1adj = count1[count1.nonzero()]
assert_array_almost_equal(bcount, count1adj)
if __name__ == "__main__":
run_module_suite()

@ -1,202 +0,0 @@
from __future__ import division, print_function, absolute_import
import numpy as np
from numpy.testing import (run_module_suite, assert_equal, assert_array_equal,
assert_array_almost_equal, assert_approx_equal, assert_raises,
assert_allclose)
from scipy.special import xlogy
from scipy.stats.contingency import margins, expected_freq, chi2_contingency
def test_margins():
a = np.array([1])
m = margins(a)
assert_equal(len(m), 1)
m0 = m[0]
assert_array_equal(m0, np.array([1]))
a = np.array([[1]])
m0, m1 = margins(a)
expected0 = np.array([[1]])
expected1 = np.array([[1]])
assert_array_equal(m0, expected0)
assert_array_equal(m1, expected1)
a = np.arange(12).reshape(2, 6)
m0, m1 = margins(a)
expected0 = np.array([[15], [51]])
expected1 = np.array([[6, 8, 10, 12, 14, 16]])
assert_array_equal(m0, expected0)
assert_array_equal(m1, expected1)
a = np.arange(24).reshape(2, 3, 4)
m0, m1, m2 = margins(a)
expected0 = np.array([[[66]], [[210]]])
expected1 = np.array([[[60], [92], [124]]])
expected2 = np.array([[[60, 66, 72, 78]]])
assert_array_equal(m0, expected0)
assert_array_equal(m1, expected1)
assert_array_equal(m2, expected2)
def test_expected_freq():
assert_array_equal(expected_freq([1]), np.array([1.0]))
observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
e = expected_freq(observed)
assert_array_equal(e, np.ones_like(observed))
observed = np.array([[10, 10, 20], [20, 20, 20]])
e = expected_freq(observed)
correct = np.array([[12., 12., 16.], [18., 18., 24.]])
assert_array_almost_equal(e, correct)
def test_chi2_contingency_trivial():
# Some very simple tests for chi2_contingency.
# A trivial case
obs = np.array([[1, 2], [1, 2]])
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
assert_equal(chi2, 0.0)
assert_equal(p, 1.0)
assert_equal(dof, 1)
assert_array_equal(obs, expected)
# A *really* trivial case: 1-D data.
obs = np.array([1, 2, 3])
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
assert_equal(chi2, 0.0)
assert_equal(p, 1.0)
assert_equal(dof, 0)
assert_array_equal(obs, expected)
def test_chi2_contingency_R():
# Some test cases that were computed independently, using R.
Rcode = \
"""
# Data vector.
data <- c(
12, 34, 23, 4, 47, 11,
35, 31, 11, 34, 10, 18,
12, 32, 9, 18, 13, 19,
12, 12, 14, 9, 33, 25
)
# Create factor tags:r=rows, c=columns, t=tiers
r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3")))
t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2")))
# 3-way Chi squared test of independence
s = summary(xtabs(data~r+c+t))
print(s)
"""
Routput = \
"""
Call: xtabs(formula = data ~ r + c + t)
Number of cases in table: 478
Number of factors: 3
Test for independence of all factors:
Chisq = 102.17, df = 17, p-value = 3.514e-14
"""
obs = np.array(
[[[12, 34, 23],
[35, 31, 11],
[12, 32, 9],
[12, 12, 14]],
[[4, 47, 11],
[34, 10, 18],
[18, 13, 19],
[9, 33, 25]]])
chi2, p, dof, expected = chi2_contingency(obs)
assert_approx_equal(chi2, 102.17, significant=5)
assert_approx_equal(p, 3.514e-14, significant=4)
assert_equal(dof, 17)
Rcode = \
"""
# Data vector.
data <- c(
#
12, 17,
11, 16,
#
11, 12,
15, 16,
#
23, 15,
30, 22,
#
14, 17,
15, 16
)
# Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
# 4-way Chi squared test of independence
s = summary(xtabs(data~r+c+d+t))
print(s)
"""
Routput = \
"""
Call: xtabs(formula = data ~ r + c + d + t)
Number of cases in table: 262
Number of factors: 4
Test for independence of all factors:
Chisq = 8.758, df = 11, p-value = 0.6442
"""
obs = np.array(
[[[[12, 17],
[11, 16]],
[[11, 12],
[15, 16]]],
[[[23, 15],
[30, 22]],
[[14, 17],
[15, 16]]]])
chi2, p, dof, expected = chi2_contingency(obs)
assert_approx_equal(chi2, 8.758, significant=4)
assert_approx_equal(p, 0.6442, significant=4)
assert_equal(dof, 11)
def test_chi2_contingency_g():
c = np.array([[15, 60], [15, 90]])
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=False)
assert_allclose(g, 2*xlogy(c, c/e).sum())
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=True)
c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]])
assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum())
c = np.array([[10, 12, 10], [12, 10, 10]])
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood')
assert_allclose(g, 2*xlogy(c, c/e).sum())
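# For reference (a sketch, not part of the deleted test file): with
# lambda_='log-likelihood' the statistic is the G-test,
#     G = 2 * sum(obs * log(obs / expected)),
# which is exactly what the xlogy-based assertions above verify.
def _g_statistic(observed):
    observed = np.asarray(observed, dtype=float)
    expected = expected_freq(observed)
    return 2 * xlogy(observed, observed / expected).sum()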
def test_chi2_contingency_bad_args():
# Test that "bad" inputs raise a ValueError.
# Negative value in the array of observed frequencies.
obs = np.array([[-1, 10], [1, 2]])
assert_raises(ValueError, chi2_contingency, obs)
# The zeros in this will result in zeros in the array
# of expected frequencies.
obs = np.array([[0, 1], [0, 1]])
assert_raises(ValueError, chi2_contingency, obs)
# A degenerate case: `observed` has size 0.
obs = np.empty((0, 8))
assert_raises(ValueError, chi2_contingency, obs)
if __name__ == "__main__":
run_module_suite()

@ -7,11 +7,16 @@ import numpy.testing as npt
from scipy import integrate
from wafo import stats
from wafo.stats.tests.common_tests import (check_normalization, check_moment,
check_mean_expect,
check_var_expect, check_skew_expect, check_kurt_expect,
check_entropy, check_private_entropy, NUMPY_BELOW_1_7,
check_edge_support, check_named_args)
from wafo.stats.tests.common_tests import (check_normalization, check_moment, check_mean_expect,
check_var_expect, check_skew_expect,
check_kurt_expect, check_entropy,
check_private_entropy,
check_edge_support, check_named_args,
check_random_state_property,
check_meth_dtype, check_ppf_dtype, check_cmplx_deriv,
check_pickling)
from wafo.stats._distr_params import distcont
@ -26,9 +31,12 @@ These tests currently check only/mostly for serious errors and exceptions,
not for numerically exact results.
"""
# Note that you need to add new distributions you want tested
# to _distr_params
DECIMAL = 5 # specify the precision of the tests # increased from 0 to 5
## Last four of these fail all around. Need to be checked
# Last four of these fail all around. Need to be checked
distcont_extra = [
['betaprime', (100, 86)],
['fatiguelife', (5,)],
@ -41,58 +49,37 @@ distcont_extra = [
]
# for testing only specific functions
# distcont = [
## ['fatiguelife', (29,)], #correction numargs = 1
## ['loggamma', (0.41411931826052117,)]]
# for testing ticket:767
# distcont = [
## ['genextreme', (3.3184017469423535,)],
## ['genextreme', (0.01,)],
## ['genextreme', (0.00001,)],
## ['genextreme', (0.0,)],
## ['genextreme', (-0.01,)]
## ]
# distcont = [['gumbel_l', ()],
## ['gumbel_r', ()],
## ['norm', ()]
## ]
# distcont = [['norm', ()]]
distmissing = ['wald', 'gausshyper', 'genexpon', 'rv_continuous',
'loglaplace', 'rdist', 'semicircular', 'invweibull', 'ksone',
'cosine', 'kstwobign', 'truncnorm', 'mielke', 'recipinvgauss', 'levy',
'johnsonsu', 'levy_l', 'powernorm', 'wrapcauchy',
'johnsonsb', 'truncexpon', 'rice', 'invgauss', 'invgamma',
'powerlognorm']
distmiss = [[dist,args] for dist,args in distcont if dist in distmissing]
distslow = ['rdist', 'gausshyper', 'recipinvgauss', 'ksone', 'genexpon',
'vonmises', 'vonmises_line', 'mielke', 'semicircular',
'cosine', 'invweibull', 'powerlognorm', 'johnsonsu', 'kstwobign']
# distslow are sorted by speed (very slow to slow)
# NB: not needed anymore?
def _silence_fp_errors(func):
# warning: don't apply to test_ functions as-is, or they will be skipped
def wrap(*a, **kw):
olderr = np.seterr(all='ignore')
try:
return func(*a, **kw)
finally:
np.seterr(**olderr)
wrap.__name__ = func.__name__
return wrap
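# The same effect can be scoped with a context manager (sketch; numpy.errstate
# is the usual way to silence floating-point warnings without a wrapper):
def _silenced_call(func, *a, **kw):
    with np.errstate(all='ignore'):
        return func(*a, **kw)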
# These distributions fail the complex derivative test below.
# Here 'fail' means produce wrong results and/or raise exceptions, depending
# on the implementation details of corresponding special functions.
# cf https://github.com/scipy/scipy/pull/4979 for a discussion.
fails_cmplx = set(['alpha', 'beta', 'betaprime', 'burr12', 'chi', 'chi2', 'dgamma',
'dweibull', 'erlang', 'expon', 'exponnorm', 'exponpow',
'exponweib', 'f', 'fatiguelife', 'foldnorm', 'frechet_l',
'frechet_r', 'gamma', 'gausshyper', 'genexpon',
'genextreme', 'gengamma', 'genlogistic', 'gennorm',
'genpareto', 'gilbrat', 'gompertz', 'halfcauchy',
'halfgennorm', 'halflogistic', 'halfnorm', 'invgamma',
'invgauss', 'johnsonsb', 'johnsonsu', 'ksone', 'kstwobign',
'levy_l', 'loggamma', 'logistic', 'lognorm', 'lomax',
'maxwell', 'nakagami', 'ncf', 'nct', 'ncx2', 'norm',
'pearson3', 'powerlognorm', 'powernorm', 'rayleigh',
'recipinvgauss', 'rice', 'skewnorm', 't', 'truncexpon', 'truncnorm',
'tukeylambda', 'vonmises', 'vonmises_line', 'wald',
'weibull_min'])
def test_cont_basic():
# this test skips slow distributions
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=integrate.IntegrationWarning)
warnings.filterwarnings('ignore',
category=integrate.IntegrationWarning)
for distname, arg in distcont[:]:
if distname in distslow:
continue
@ -106,17 +93,17 @@ def test_cont_basic():
sv = rvs.var()
m, v = distfn.stats(*arg)
yield check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn, \
distname + 'sample mean test'
yield (check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn,
distname + 'sample mean test')
yield check_cdf_ppf, distfn, arg, distname
yield check_sf_isf, distfn, arg, distname
yield check_pdf, distfn, arg, distname
yield check_pdf_logpdf, distfn, arg, distname
yield check_cdf_logcdf, distfn, arg, distname
yield check_sf_logsf, distfn, arg, distname
if distname in distmissing:
alpha = 0.01
yield check_distribution_rvs, distname, arg, alpha, rvs
alpha = 0.01
yield check_distribution_rvs, distname, arg, alpha, rvs
locscale_defaults = (0, 1)
meths = [distfn.pdf, distfn.logpdf, distfn.cdf, distfn.logcdf,
@ -126,28 +113,35 @@ def test_cont_basic():
'pareto': 1.5, 'tukeylambda': 0.3}
x = spec_x.get(distname, 0.5)
yield check_named_args, distfn, x, arg, locscale_defaults, meths
yield check_random_state_property, distfn, arg
# yield check_pickling, distfn, arg
# Entropy
skp = npt.dec.skipif
yield check_entropy, distfn, arg, distname
if distfn.numargs == 0:
yield skp(NUMPY_BELOW_1_7)(check_vecentropy), distfn, arg
yield check_vecentropy, distfn, arg
if distfn.__class__._entropy != stats.rv_continuous._entropy:
yield check_private_entropy, distfn, arg, stats.rv_continuous
yield check_edge_support, distfn, arg
yield check_meth_dtype, distfn, arg, meths
yield check_ppf_dtype, distfn, arg
yield skp(distname in fails_cmplx)(check_cmplx_deriv), distfn, arg
knf = npt.dec.knownfailureif
yield knf(distname == 'truncnorm')(check_ppf_private), distfn, \
arg, distname
yield (knf(distname == 'truncnorm')(check_ppf_private), distfn,
arg, distname)
@npt.dec.slow
def test_cont_basic_slow():
# same as above for slow distributions
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=integrate.IntegrationWarning)
warnings.filterwarnings('ignore',
category=integrate.IntegrationWarning)
for distname, arg in distcont[:]:
if distname not in distslow:
continue
@ -156,12 +150,12 @@ def test_cont_basic_slow():
distfn = getattr(stats, distname)
np.random.seed(765456)
sn = 500
rvs = distfn.rvs(size=sn,*arg)
rvs = distfn.rvs(size=sn, *arg)
sm = rvs.mean()
sv = rvs.var()
m, v = distfn.stats(*arg)
yield check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn, \
distname + 'sample mean test'
yield (check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn,
distname + 'sample mean test')
yield check_cdf_ppf, distfn, arg, distname
yield check_sf_isf, distfn, arg, distname
yield check_pdf, distfn, arg, distname
@ -169,9 +163,9 @@ def test_cont_basic_slow():
yield check_cdf_logcdf, distfn, arg, distname
yield check_sf_logsf, distfn, arg, distname
# yield check_oth, distfn, arg # is still missing
if distname in distmissing:
alpha = 0.01
yield check_distribution_rvs, distname, arg, alpha, rvs
alpha = 0.01
yield check_distribution_rvs, distname, arg, alpha, rvs
locscale_defaults = (0, 1)
meths = [distfn.pdf, distfn.logpdf, distfn.cdf, distfn.logcdf,
@ -183,6 +177,8 @@ def test_cont_basic_slow():
elif distname == 'ksone':
arg = (3,)
yield check_named_args, distfn, x, arg, locscale_defaults, meths
yield check_random_state_property, distfn, arg
# yield check_pickling, distfn, arg
# Entropy
skp = npt.dec.skipif
@ -190,17 +186,22 @@ def test_cont_basic_slow():
yield skp(ks_cond)(check_entropy), distfn, arg, distname
if distfn.numargs == 0:
yield skp(NUMPY_BELOW_1_7)(check_vecentropy), distfn, arg
yield check_vecentropy, distfn, arg
if distfn.__class__._entropy != stats.rv_continuous._entropy:
yield check_private_entropy, distfn, arg, stats.rv_continuous
yield check_edge_support, distfn, arg
yield check_meth_dtype, distfn, arg, meths
yield check_ppf_dtype, distfn, arg
yield skp(distname in fails_cmplx)(check_cmplx_deriv), distfn, arg
@npt.dec.slow
def test_moments():
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=integrate.IntegrationWarning)
warnings.filterwarnings('ignore',
category=integrate.IntegrationWarning)
knf = npt.dec.knownfailureif
fail_normalization = set(['vonmises', 'ksone'])
fail_higher = set(['vonmises', 'ksone', 'ncf'])
@ -209,28 +210,30 @@ def test_moments():
continue
distfn = getattr(stats, distname)
m, v, s, k = distfn.stats(*arg, moments='mvsk')
cond1, cond2 = distname in fail_normalization, distname in fail_higher
cond1 = distname in fail_normalization
cond2 = distname in fail_higher
msg = distname + ' fails moments'
yield knf(cond1, msg)(check_normalization), distfn, arg, distname
yield knf(cond2, msg)(check_mean_expect), distfn, arg, m, distname
yield knf(cond2, msg)(check_var_expect), distfn, arg, m, v, distname
yield knf(cond2, msg)(check_skew_expect), distfn, arg, m, v, s, \
distname
yield knf(cond2, msg)(check_kurt_expect), distfn, arg, m, v, k, \
distname
yield (knf(cond2, msg)(check_var_expect), distfn, arg, m, v,
distname)
yield (knf(cond2, msg)(check_skew_expect), distfn, arg, m, v, s,
distname)
yield (knf(cond2, msg)(check_kurt_expect), distfn, arg, m, v, k,
distname)
yield check_loc_scale, distfn, arg, m, v, distname
yield check_moment, distfn, arg, m, v, distname
def check_sample_meanvar_(distfn, arg, m, v, sm, sv, sn, msg):
# this did not work, skipped silently by nose
if not np.isinf(m):
if np.isfinite(m):
check_sample_mean(sm, sv, sn, m)
if not np.isinf(v):
if np.isfinite(v):
check_sample_var(sv, sn, v)
def check_sample_mean(sm,v,n, popmean):
def check_sample_mean(sm, v, n, popmean):
# from stats.stats.ttest_1samp(a, popmean):
# Calculates the t-obtained for the independent samples T-test on ONE group
# of scores a, given a population mean.
@ -243,31 +246,32 @@ def check_sample_mean(sm,v,n, popmean):
# return t,prob
npt.assert_(prob > 0.01, 'mean fail, t,prob = %f, %f, m, sm=%f,%f' %
(t, prob, popmean, sm))
(t, prob, popmean, sm))
def check_sample_var(sv,n, popvar):
# two-sided chisquare test for sample variance equal to hypothesized variance
def check_sample_var(sv, n, popvar):
# two-sided chisquare test for sample variance equal to
# hypothesized variance
df = n-1
chi2 = (n-1)*sv/float(popvar)  # (n-1) * sample variance / hypothesized variance
pval = stats.chisqprob(chi2,df)*2
pval = stats.distributions.chi2.sf(chi2, df) * 2
npt.assert_(pval > 0.01, 'var fail, t, pval = %f, %f, v, sv=%f, %f' %
(chi2,pval,popvar,sv))
(chi2, pval, popvar, sv))
def check_cdf_ppf(distfn,arg,msg):
def check_cdf_ppf(distfn, arg, msg):
values = [0.001, 0.5, 0.999]
npt.assert_almost_equal(distfn.cdf(distfn.ppf(values, *arg), *arg),
values, decimal=DECIMAL, err_msg=msg +
' - cdf-ppf roundtrip')
def check_sf_isf(distfn,arg,msg):
npt.assert_almost_equal(distfn.sf(distfn.isf([0.1,0.5,0.9], *arg), *arg),
[0.1,0.5,0.9], decimal=DECIMAL, err_msg=msg +
def check_sf_isf(distfn, arg, msg):
npt.assert_almost_equal(distfn.sf(distfn.isf([0.1, 0.5, 0.9], *arg), *arg),
[0.1, 0.5, 0.9], decimal=DECIMAL, err_msg=msg +
' - sf-isf roundtrip')
npt.assert_almost_equal(distfn.cdf([0.1,0.9], *arg),
1.0-distfn.sf([0.1,0.9], *arg),
npt.assert_almost_equal(distfn.cdf([0.1, 0.9], *arg),
1.0 - distfn.sf([0.1, 0.9], *arg),
decimal=DECIMAL, err_msg=msg +
' - cdf-sf relationship')
@ -278,15 +282,16 @@ def check_pdf(distfn, arg, msg):
eps = 1e-6
pdfv = distfn.pdf(median, *arg)
if (pdfv < 1e-4) or (pdfv > 1e4):
# avoid checking a case where pdf is close to zero or huge (singularity)
# avoid checking a case where pdf is close to zero or
# huge (singularity)
median = median + 0.1
pdfv = distfn.pdf(median, *arg)
cdfdiff = (distfn.cdf(median + eps, *arg) -
distfn.cdf(median - eps, *arg))/eps/2.0
# replace with better diff and better test (more points),
# actually, this works pretty well
npt.assert_almost_equal(pdfv, cdfdiff,
decimal=DECIMAL, err_msg=msg + ' - cdf-pdf relationship')
msg += ' - cdf-pdf relationship'
npt.assert_almost_equal(pdfv, cdfdiff, decimal=DECIMAL, err_msg=msg)
def check_pdf_logpdf(distfn, args, msg):
@ -297,7 +302,8 @@ def check_pdf_logpdf(distfn, args, msg):
logpdf = distfn.logpdf(vals, *args)
pdf = pdf[pdf != 0]
logpdf = logpdf[np.isfinite(logpdf)]
npt.assert_almost_equal(np.log(pdf), logpdf, decimal=7, err_msg=msg + " - logpdf-log(pdf) relationship")
msg += " - logpdf-log(pdf) relationship"
npt.assert_almost_equal(np.log(pdf), logpdf, decimal=7, err_msg=msg)
def check_sf_logsf(distfn, args, msg):
@ -308,7 +314,8 @@ def check_sf_logsf(distfn, args, msg):
logsf = distfn.logsf(vals, *args)
sf = sf[sf != 0]
logsf = logsf[np.isfinite(logsf)]
npt.assert_almost_equal(np.log(sf), logsf, decimal=7, err_msg=msg + " - logsf-log(sf) relationship")
msg += " - logsf-log(sf) relationship"
npt.assert_almost_equal(np.log(sf), logsf, decimal=7, err_msg=msg)
def check_cdf_logcdf(distfn, args, msg):
@ -319,24 +326,24 @@ def check_cdf_logcdf(distfn, args, msg):
logcdf = distfn.logcdf(vals, *args)
cdf = cdf[cdf != 0]
logcdf = logcdf[np.isfinite(logcdf)]
npt.assert_almost_equal(np.log(cdf), logcdf, decimal=7, err_msg=msg + " - logcdf-log(cdf) relationship")
msg += " - logcdf-log(cdf) relationship"
npt.assert_almost_equal(np.log(cdf), logcdf, decimal=7, err_msg=msg)
def check_distribution_rvs(dist, args, alpha, rvs):
# test from scipy.stats.tests
# this version reuses existing random variables
D,pval = stats.kstest(rvs, dist, args=args, N=1000)
D, pval = stats.kstest(rvs, dist, args=args, N=1000)
if (pval < alpha):
D,pval = stats.kstest(dist,'',args=args, N=1000)
D, pval = stats.kstest(dist, '', args=args, N=1000)
npt.assert_(pval > alpha, "D = " + str(D) + "; pval = " + str(pval) +
"; alpha = " + str(alpha) + "\nargs = " + str(args))
"; alpha = " + str(alpha) + "\nargs = " + str(args))
def check_vecentropy(distfn, args):
npt.assert_equal(distfn.vecentropy(*args), distfn._entropy(*args))
@npt.dec.skipif(NUMPY_BELOW_1_7)
def check_loc_scale(distfn, arg, m, v, msg):
loc, scale = 10.0, 10.0
mt, vt = distfn.stats(loc=loc, scale=scale, *arg)
@ -345,7 +352,7 @@ def check_loc_scale(distfn, arg, m, v, msg):
def check_ppf_private(distfn, arg, msg):
#fails by design for truncnorm self.nb not defined
# fails by design for truncnorm self.nb not defined
ppfs = distfn._ppf(np.array([0.1, 0.5, 0.9]), *arg)
npt.assert_(not np.any(np.isnan(ppfs)), msg + 'ppf private is nan')

@ -5,18 +5,26 @@ import numpy as np
from scipy._lib.six import xrange
from wafo import stats
from wafo.stats.tests.common_tests import (check_normalization, check_moment,
check_mean_expect,
check_var_expect, check_skew_expect, check_kurt_expect,
check_entropy, check_private_entropy, check_edge_support,
check_named_args)
from wafo.stats.tests.common_tests import (check_normalization, check_moment, check_mean_expect,
check_var_expect, check_skew_expect,
check_kurt_expect, check_entropy,
check_private_entropy, check_edge_support,
check_named_args, check_random_state_property,
check_pickling)
from wafo.stats._distr_params import distdiscrete
knf = npt.dec.knownfailureif
vals = ([1, 2, 3, 4], [0.1, 0.2, 0.3, 0.4])
distdiscrete += [[stats.rv_discrete(values=vals), ()]]
def test_discrete_basic():
for distname, arg in distdiscrete:
distfn = getattr(stats, distname)
try:
distfn = getattr(stats, distname)
except TypeError:
distfn = distname
distname = 'sample distribution'
np.random.seed(9765456)
rvs = distfn.rvs(size=2000, *arg)
supp = np.unique(rvs)
@ -28,15 +36,19 @@ def test_discrete_basic():
yield check_edge_support, distfn, arg
alpha = 0.01
yield check_discrete_chisquare, distfn, arg, rvs, alpha, \
distname + ' chisquare'
yield (check_discrete_chisquare, distfn, arg, rvs, alpha,
distname + ' chisquare')
seen = set()
for distname, arg in distdiscrete:
if distname in seen:
continue
seen.add(distname)
distfn = getattr(stats,distname)
try:
distfn = getattr(stats, distname)
except TypeError:
distfn = distname
distname = 'sample distribution'
locscale_defaults = (0,)
meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf,
distfn.logsf]
@ -44,7 +56,10 @@ def test_discrete_basic():
spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0, }
k = spec_k.get(distname, 1)
yield check_named_args, distfn, k, arg, locscale_defaults, meths
yield check_scale_docstring, distfn
if distname != 'sample distribution':
yield check_scale_docstring, distfn
yield check_random_state_property, distfn, arg
yield check_pickling, distfn, arg
# Entropy
yield check_entropy, distfn, arg, distname
@ -54,7 +69,11 @@ def test_discrete_basic():
def test_moments():
for distname, arg in distdiscrete:
distfn = getattr(stats,distname)
try:
distfn = getattr(stats, distname)
except TypeError:
distfn = distname
distname = 'sample distribution'
m, v, s, k = distfn.stats(*arg, moments='mvsk')
yield check_normalization, distfn, arg, distname
@ -64,7 +83,7 @@ def test_moments():
yield check_var_expect, distfn, arg, m, v, distname
yield check_skew_expect, distfn, arg, m, v, s, distname
cond = False #distname in ['zipf']
cond = distname in ['zipf']
msg = distname + ' fails kurtosis'
yield knf(cond, msg)(check_kurt_expect), distfn, arg, m, v, k, distname
@ -81,35 +100,36 @@ def check_cdf_ppf(distfn, arg, supp, msg):
supp, msg + '-roundtrip')
supp1 = supp[supp < distfn.b]
npt.assert_array_equal(distfn.ppf(distfn.cdf(supp1, *arg) + 1e-8, *arg),
supp1 + distfn.inc, msg + 'ppf-cdf-next')
supp1 + distfn.inc, msg + 'ppf-cdf-next')
# -1e-8 could cause an error if pmf < 1e-8
def check_pmf_cdf(distfn, arg, distname):
startind = np.int(distfn.ppf(0.01, *arg) - 1)
startind = int(distfn.ppf(0.01, *arg) - 1)
index = list(range(startind, startind + 10))
cdfs, pmfs_cum = distfn.cdf(index,*arg), distfn.pmf(index, *arg).cumsum()
cdfs = distfn.cdf(index, *arg)
pmfs_cum = distfn.pmf(index, *arg).cumsum()
atol, rtol = 1e-10, 1e-10
if distname == 'skellam': # ncx2 accuracy
atol, rtol = 1e-5, 1e-5
npt.assert_allclose(cdfs - cdfs[0], pmfs_cum - pmfs_cum[0],
atol=atol, rtol=rtol)
atol=atol, rtol=rtol)
def check_moment_frozen(distfn, arg, m, k):
npt.assert_allclose(distfn(*arg).moment(k), m,
atol=1e-10, rtol=1e-10)
atol=1e-10, rtol=1e-10)
def check_oth(distfn, arg, supp, msg):
# checking other methods of distfn
npt.assert_allclose(distfn.sf(supp, *arg), 1. - distfn.cdf(supp, *arg),
atol=1e-10, rtol=1e-10)
atol=1e-10, rtol=1e-10)
q = np.linspace(0.01, 0.99, 20)
npt.assert_allclose(distfn.isf(q, *arg), distfn.ppf(1. - q, *arg),
atol=1e-10, rtol=1e-10)
atol=1e-10, rtol=1e-10)
median_sf = distfn.isf(0.5, *arg)
npt.assert_(distfn.sf(median_sf - 1, *arg) > 0.5)
@ -133,44 +153,41 @@ def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
result : bool
0 if test passes, 1 if test fails
uses global variable debug for printing results
"""
n = len(rvs)
nsupp = 20
wsupp = 1.0/nsupp
wsupp = 0.05
# construct intervals with minimum mass 1/nsupp
# construct intervals with minimum mass `wsupp`.
# intervals are left-half-open as in a cdf difference
distsupport = xrange(max(distfn.a, -1000), min(distfn.b, 1000) + 1)
lo = max(distfn.a, -1000)
distsupport = xrange(lo, min(distfn.b, 1000) + 1)
last = 0
distsupp = [max(distfn.a, -1000)]
distsupp = [lo]
distmass = []
for ii in distsupport:
current = distfn.cdf(ii,*arg)
if current - last >= wsupp-1e-14:
current = distfn.cdf(ii, *arg)
if current - last >= wsupp - 1e-14:
distsupp.append(ii)
distmass.append(current - last)
last = current
if current > (1-wsupp):
if current > (1 - wsupp):
break
if distsupp[-1] < distfn.b:
distsupp.append(distfn.b)
distmass.append(1-last)
distmass.append(1 - last)
distsupp = np.array(distsupp)
distmass = np.array(distmass)
# convert intervals to right-half-open as required by histogram
histsupp = distsupp+1e-8
histsupp = distsupp + 1e-8
histsupp[0] = distfn.a
# find sample frequencies and perform chisquare test
freq,hsupp = np.histogram(rvs,histsupp)
cdfs = distfn.cdf(distsupp,*arg)
(chis,pval) = stats.chisquare(np.array(freq),n*distmass)
freq, hsupp = np.histogram(rvs, histsupp)
chis, pval = stats.chisquare(np.array(freq), len(rvs)*distmass)
npt.assert_(pval > alpha, 'chisquare - test for %s'
' at arg = %s with pval = %s' % (msg,str(arg),str(pval)))
npt.assert_(pval > alpha,
'chisquare - test for %s at arg = %s with pval = %s' %
(msg, str(arg), str(pval)))
def check_scale_docstring(distfn):

File diff suppressed because it is too large

@ -3,7 +3,7 @@ from __future__ import division, print_function, absolute_import
import os
import numpy as np
from numpy.testing import dec
from numpy.testing import dec, assert_allclose
from wafo import stats
@ -13,12 +13,12 @@ from wafo.stats.tests.test_continuous_basic import distcont
# verifies that the estimate and true values don't differ by too much
fit_sizes = [1000, 5000] # sample sizes to try
thresh_percent = 0.25 # percent of true parameters for fail cut-off
thresh_min = 0.75 # minimum difference estimate - true to fail test
failing_fits = [
'burr',
'chi',
'chi2',
'gausshyper',
'genexpon',
@ -107,5 +107,18 @@ def check_cont_fit(distname,arg):
raise AssertionError('fit not very good in %s\n' % distfn.name + txt)
def _check_loc_scale_mle_fit(name, data, desired, atol=None):
d = getattr(stats, name)
actual = d.fit(data)[-2:]
assert_allclose(actual, desired, atol=atol,
err_msg='poor mle fit of (loc, scale) in %s' % name)
def test_non_default_loc_scale_mle_fit():
data = np.array([1.01, 1.78, 1.78, 1.78, 1.88, 1.88, 1.88, 2.00])
yield _check_loc_scale_mle_fit, 'uniform', data, [1.01, 0.99], 1e-3
yield _check_loc_scale_mle_fit, 'expon', data, [1.01, 0.73875], 1e-3
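# Why those 'desired' values (sketch): with both loc and scale free, the MLE
# for the uniform distribution is (min(data), max(data) - min(data)) and for
# the exponential it is (min(data), mean(data) - min(data)), which gives
# (1.01, 0.99) and (1.01, 0.73875) for the data above.
def _uniform_expon_mle(data):
    loc_u, scale_u = data.min(), data.max() - data.min()
    loc_e, scale_e = data.min(), data.mean() - data.min()
    return (loc_u, scale_u), (loc_e, scale_e)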
if __name__ == "__main__":
np.testing.run_module_suite()

@ -1,202 +0,0 @@
from __future__ import division, print_function, absolute_import
from wafo import stats
import numpy as np
from numpy.testing import assert_almost_equal, assert_, assert_raises, \
assert_array_almost_equal, assert_array_almost_equal_nulp, run_module_suite
def test_kde_1d():
# some basic tests comparing to the normal distribution
np.random.seed(8765678)
n_basesample = 500
xn = np.random.randn(n_basesample)
xnmean = xn.mean()
xnstd = xn.std(ddof=1)
# get kde for original sample
gkde = stats.gaussian_kde(xn)
# evaluate the density function for the kde for some points
xs = np.linspace(-7,7,501)
kdepdf = gkde.evaluate(xs)
normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd)
intervall = xs[1] - xs[0]
assert_(np.sum((kdepdf - normpdf)**2)*intervall < 0.01)
prob1 = gkde.integrate_box_1d(xnmean, np.inf)
prob2 = gkde.integrate_box_1d(-np.inf, xnmean)
assert_almost_equal(prob1, 0.5, decimal=1)
assert_almost_equal(prob2, 0.5, decimal=1)
assert_almost_equal(gkde.integrate_box(xnmean, np.inf), prob1, decimal=13)
assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), prob2, decimal=13)
assert_almost_equal(gkde.integrate_kde(gkde),
(kdepdf**2).sum()*intervall, decimal=2)
assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2),
(kdepdf*normpdf).sum()*intervall, decimal=2)
def test_kde_bandwidth_method():
def scotts_factor(kde_obj):
"""Same as default, just check that it works."""
return np.power(kde_obj.n, -1./(kde_obj.d+4))
np.random.seed(8765678)
n_basesample = 50
xn = np.random.randn(n_basesample)
# Default
gkde = stats.gaussian_kde(xn)
# Supply a callable
gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor)
# Supply a scalar
gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor)
xs = np.linspace(-7,7,51)
kdepdf = gkde.evaluate(xs)
kdepdf2 = gkde2.evaluate(xs)
assert_almost_equal(kdepdf, kdepdf2)
kdepdf3 = gkde3.evaluate(xs)
assert_almost_equal(kdepdf, kdepdf3)
assert_raises(ValueError, stats.gaussian_kde, xn, bw_method='wrongstring')
# Subclasses that should stay working (extracted from various sources).
# Unfortunately the earlier design of gaussian_kde made it necessary for users
# to create these kinds of subclasses, or call _compute_covariance() directly.
class _kde_subclass1(stats.gaussian_kde):
def __init__(self, dataset):
self.dataset = np.atleast_2d(dataset)
self.d, self.n = self.dataset.shape
self.covariance_factor = self.scotts_factor
self._compute_covariance()
class _kde_subclass2(stats.gaussian_kde):
def __init__(self, dataset):
self.covariance_factor = self.scotts_factor
super(_kde_subclass2, self).__init__(dataset)
class _kde_subclass3(stats.gaussian_kde):
def __init__(self, dataset, covariance):
self.covariance = covariance
stats.gaussian_kde.__init__(self, dataset)
def _compute_covariance(self):
self.inv_cov = np.linalg.inv(self.covariance)
self._norm_factor = np.sqrt(np.linalg.det(2*np.pi * self.covariance)) \
* self.n
class _kde_subclass4(stats.gaussian_kde):
def covariance_factor(self):
return 0.5 * self.silverman_factor()
def test_gaussian_kde_subclassing():
x1 = np.array([-7, -5, 1, 4, 5], dtype=np.float)
xs = np.linspace(-10, 10, num=50)
# gaussian_kde itself
kde = stats.gaussian_kde(x1)
ys = kde(xs)
# subclass 1
kde1 = _kde_subclass1(x1)
y1 = kde1(xs)
assert_array_almost_equal_nulp(ys, y1, nulp=10)
# subclass 2
kde2 = _kde_subclass2(x1)
y2 = kde2(xs)
assert_array_almost_equal_nulp(ys, y2, nulp=10)
# subclass 3
kde3 = _kde_subclass3(x1, kde.covariance)
y3 = kde3(xs)
assert_array_almost_equal_nulp(ys, y3, nulp=10)
# subclass 4
kde4 = _kde_subclass4(x1)
y4 = kde4(x1)
y_expected = [0.06292987, 0.06346938, 0.05860291, 0.08657652, 0.07904017]
assert_array_almost_equal(y_expected, y4, decimal=6)
# Not a subclass, but check for use of _compute_covariance()
kde5 = kde
kde5.covariance_factor = lambda: kde.factor
kde5._compute_covariance()
y5 = kde5(xs)
assert_array_almost_equal_nulp(ys, y5, nulp=10)
def test_gaussian_kde_covariance_caching():
x1 = np.array([-7, -5, 1, 4, 5], dtype=np.float)
xs = np.linspace(-10, 10, num=5)
# These expected values are from scipy 0.10, before some changes to
# gaussian_kde. They were not compared with any external reference.
y_expected = [0.02463386, 0.04689208, 0.05395444, 0.05337754, 0.01664475]
# Set the bandwidth, then reset it to the default.
kde = stats.gaussian_kde(x1)
kde.set_bandwidth(bw_method=0.5)
kde.set_bandwidth(bw_method='scott')
y2 = kde(xs)
assert_array_almost_equal(y_expected, y2, decimal=7)
def test_gaussian_kde_monkeypatch():
"""Ugly, but people may rely on this. See scipy pull request 123,
specifically the linked ML thread "Width of the Gaussian in stats.kde".
If it is necessary to break this later on, that is to be discussed on ML.
"""
x1 = np.array([-7, -5, 1, 4, 5], dtype=np.float)
xs = np.linspace(-10, 10, num=50)
# The old monkeypatched version to get at Silverman's Rule.
kde = stats.gaussian_kde(x1)
kde.covariance_factor = kde.silverman_factor
kde._compute_covariance()
y1 = kde(xs)
# The new saner version.
kde2 = stats.gaussian_kde(x1, bw_method='silverman')
y2 = kde2(xs)
assert_array_almost_equal_nulp(y1, y2, nulp=10)
def test_kde_integer_input():
"""Regression test for #1181."""
x1 = np.arange(5)
kde = stats.gaussian_kde(x1)
y_expected = [0.13480721, 0.18222869, 0.19514935, 0.18222869, 0.13480721]
assert_array_almost_equal(kde(x1), y_expected, decimal=6)
def test_pdf_logpdf():
np.random.seed(1)
n_basesample = 50
xn = np.random.randn(n_basesample)
# Default
gkde = stats.gaussian_kde(xn)
xs = np.linspace(-15, 12, 25)
pdf = gkde.evaluate(xs)
pdf2 = gkde.pdf(xs)
assert_almost_equal(pdf, pdf2, decimal=12)
logpdf = np.log(pdf)
logpdf2 = gkde.logpdf(xs)
assert_almost_equal(logpdf, logpdf2, decimal=12)
if __name__ == "__main__":
run_module_suite()

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -1,107 +0,0 @@
# pylint: disable-msg=W0611, W0612, W0511,R0201
"""Tests suite for maskedArray statistics.
:author: Pierre Gerard-Marchant
:contact: pierregm_at_uga_dot_edu
"""
from __future__ import division, print_function, absolute_import
__author__ = "Pierre GF Gerard-Marchant ($Author: backtopop $)"
import numpy as np
import numpy.ma as ma
import wafo.stats.mstats as ms
#import wafo.stats.mmorestats as mms
from numpy.testing import TestCase, run_module_suite, assert_equal, \
assert_almost_equal, assert_
class TestMisc(TestCase):
def __init__(self, *args, **kwargs):
TestCase.__init__(self, *args, **kwargs)
def test_mjci(self):
"Tests the Marits-Jarrett estimator"
data = ma.array([77, 87, 88,114,151,210,219,246,253,262,
296,299,306,376,428,515,666,1310,2611])
assert_almost_equal(ms.mjci(data),[55.76819,45.84028,198.87875],5)
def test_trimmedmeanci(self):
"Tests the confidence intervals of the trimmed mean."
data = ma.array([545,555,558,572,575,576,578,580,
594,605,635,651,653,661,666])
assert_almost_equal(ms.trimmed_mean(data,0.2), 596.2, 1)
assert_equal(np.round(ms.trimmed_mean_ci(data,(0.2,0.2)),1),
[561.8, 630.6])
def test_idealfourths(self):
"Tests ideal-fourths"
test = np.arange(100)
assert_almost_equal(np.asarray(ms.idealfourths(test)),
[24.416667,74.583333],6)
test_2D = test.repeat(3).reshape(-1,3)
assert_almost_equal(ms.idealfourths(test_2D, axis=0),
[[24.416667,24.416667,24.416667],
[74.583333,74.583333,74.583333]],6)
assert_almost_equal(ms.idealfourths(test_2D, axis=1),
test.repeat(2).reshape(-1,2))
test = [0,0]
_result = ms.idealfourths(test)
assert_(np.isnan(_result).all())
#..............................................................................
class TestQuantiles(TestCase):
def __init__(self, *args, **kwargs):
TestCase.__init__(self, *args, **kwargs)
def test_hdquantiles(self):
data = [0.706560797,0.727229578,0.990399276,0.927065621,0.158953014,
0.887764025,0.239407086,0.349638551,0.972791145,0.149789972,
0.936947700,0.132359948,0.046041972,0.641675031,0.945530547,
0.224218684,0.771450991,0.820257774,0.336458052,0.589113496,
0.509736129,0.696838829,0.491323573,0.622767425,0.775189248,
0.641461450,0.118455200,0.773029450,0.319280007,0.752229111,
0.047841438,0.466295911,0.583850781,0.840581845,0.550086491,
0.466470062,0.504765074,0.226855960,0.362641207,0.891620942,
0.127898691,0.490094097,0.044882048,0.041441695,0.317976349,
0.504135618,0.567353033,0.434617473,0.636243375,0.231803616,
0.230154113,0.160011327,0.819464108,0.854706985,0.438809221,
0.487427267,0.786907310,0.408367937,0.405534192,0.250444460,
0.995309248,0.144389588,0.739947527,0.953543606,0.680051621,
0.388382017,0.863530727,0.006514031,0.118007779,0.924024803,
0.384236354,0.893687694,0.626534881,0.473051932,0.750134705,
0.241843555,0.432947602,0.689538104,0.136934797,0.150206859,
0.474335206,0.907775349,0.525869295,0.189184225,0.854284286,
0.831089744,0.251637345,0.587038213,0.254475554,0.237781276,
0.827928620,0.480283781,0.594514455,0.213641488,0.024194386,
0.536668589,0.699497811,0.892804071,0.093835427,0.731107772]
#
assert_almost_equal(ms.hdquantiles(data,[0., 1.]),
[0.006514031, 0.995309248])
hdq = ms.hdquantiles(data,[0.25, 0.5, 0.75])
assert_almost_equal(hdq, [0.253210762, 0.512847491, 0.762232442,])
hdq = ms.hdquantiles_sd(data,[0.25, 0.5, 0.75])
assert_almost_equal(hdq, [0.03786954, 0.03805389, 0.03800152,], 4)
#
data = np.array(data).reshape(10,10)
hdq = ms.hdquantiles(data,[0.25,0.5,0.75],axis=0)
assert_almost_equal(hdq[:,0], ms.hdquantiles(data[:,0],[0.25,0.5,0.75]))
assert_almost_equal(hdq[:,-1], ms.hdquantiles(data[:,-1],[0.25,0.5,0.75]))
hdq = ms.hdquantiles(data,[0.25,0.5,0.75],axis=0,var=True)
assert_almost_equal(hdq[...,0],
ms.hdquantiles(data[:,0],[0.25,0.5,0.75],var=True))
assert_almost_equal(hdq[...,-1],
ms.hdquantiles(data[:,-1],[0.25,0.5,0.75], var=True))
###############################################################################
if __name__ == "__main__":
run_module_suite()

@ -1,485 +0,0 @@
"""
Test functions for multivariate normal distributions.
"""
from __future__ import division, print_function, absolute_import
from numpy.testing import (
assert_allclose,
assert_almost_equal,
assert_array_almost_equal,
assert_equal,
assert_raises,
run_module_suite,
)
import numpy
import numpy as np
import scipy.linalg
from wafo.stats._multivariate import _PSD, _lnB
from wafo.stats import multivariate_normal
from wafo.stats import dirichlet, beta
from wafo.stats import norm
from scipy.integrate import romb
def test_input_shape():
mu = np.arange(3)
cov = np.identity(2)
assert_raises(ValueError, multivariate_normal.pdf, (0, 1), mu, cov)
assert_raises(ValueError, multivariate_normal.pdf, (0, 1, 2), mu, cov)
def test_scalar_values():
np.random.seed(1234)
# When evaluated on scalar data, the pdf should return a scalar
x, mean, cov = 1.5, 1.7, 2.5
pdf = multivariate_normal.pdf(x, mean, cov)
assert_equal(pdf.ndim, 0)
# When evaluated on a single vector, the pdf should return a scalar
x = np.random.randn(5)
mean = np.random.randn(5)
cov = np.abs(np.random.randn(5)) # Diagonal values for cov. matrix
pdf = multivariate_normal.pdf(x, mean, cov)
assert_equal(pdf.ndim, 0)
def test_logpdf():
# Check that the log of the pdf is in fact the logpdf
np.random.seed(1234)
x = np.random.randn(5)
mean = np.random.randn(5)
cov = np.abs(np.random.randn(5))
d1 = multivariate_normal.logpdf(x, mean, cov)
d2 = multivariate_normal.pdf(x, mean, cov)
assert_allclose(d1, np.log(d2))
def test_rank():
# Check that the rank is detected correctly.
np.random.seed(1234)
n = 4
mean = np.random.randn(n)
for expected_rank in range(1, n + 1):
s = np.random.randn(n, expected_rank)
cov = np.dot(s, s.T)
distn = multivariate_normal(mean, cov, allow_singular=True)
assert_equal(distn.cov_info.rank, expected_rank)
def _sample_orthonormal_matrix(n):
M = np.random.randn(n, n)
u, s, v = scipy.linalg.svd(M)
return u
def test_degenerate_distributions():
for n in range(1, 5):
x = np.random.randn(n)
for k in range(1, n + 1):
# Sample a small covariance matrix.
s = np.random.randn(k, k)
cov_kk = np.dot(s, s.T)
# Embed the small covariance matrix into a larger low rank matrix.
cov_nn = np.zeros((n, n))
cov_nn[:k, :k] = cov_kk
# Define a rotation of the larger low rank matrix.
u = _sample_orthonormal_matrix(n)
cov_rr = np.dot(u, np.dot(cov_nn, u.T))
y = np.dot(u, x)
# Check some identities.
distn_kk = multivariate_normal(np.zeros(k), cov_kk,
allow_singular=True)
distn_nn = multivariate_normal(np.zeros(n), cov_nn,
allow_singular=True)
distn_rr = multivariate_normal(np.zeros(n), cov_rr,
allow_singular=True)
assert_equal(distn_kk.cov_info.rank, k)
assert_equal(distn_nn.cov_info.rank, k)
assert_equal(distn_rr.cov_info.rank, k)
pdf_kk = distn_kk.pdf(x[:k])
pdf_nn = distn_nn.pdf(x)
pdf_rr = distn_rr.pdf(y)
assert_allclose(pdf_kk, pdf_nn)
assert_allclose(pdf_kk, pdf_rr)
logpdf_kk = distn_kk.logpdf(x[:k])
logpdf_nn = distn_nn.logpdf(x)
logpdf_rr = distn_rr.logpdf(y)
assert_allclose(logpdf_kk, logpdf_nn)
assert_allclose(logpdf_kk, logpdf_rr)
def test_large_pseudo_determinant():
# Check that large pseudo-determinants are handled appropriately.
# Construct a singular diagonal covariance matrix
# whose pseudo determinant overflows double precision.
large_total_log = 1000.0
npos = 100
nzero = 2
large_entry = np.exp(large_total_log / npos)
n = npos + nzero
cov = np.zeros((n, n), dtype=float)
np.fill_diagonal(cov, large_entry)
cov[-nzero:, -nzero:] = 0
# Check some determinants.
assert_equal(scipy.linalg.det(cov), 0)
assert_equal(scipy.linalg.det(cov[:npos, :npos]), np.inf)
# np.linalg.slogdet is only available in numpy 1.6+
# but scipy currently supports numpy 1.5.1.
# assert_allclose(np.linalg.slogdet(cov[:npos, :npos]),
# (1, large_total_log))
# Check the pseudo-determinant.
psd = _PSD(cov)
assert_allclose(psd.log_pdet, large_total_log)
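# Illustrative sketch (not part of the original test suite): for the diagonal
# covariance built above, the log pseudo-determinant can be accumulated in log
# space, which stays finite even though prod(diag) overflows double precision.
def _log_pdet_diagonal_sketch(cov):
    d = np.diag(cov)
    return np.sum(np.log(d[d > 0]))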
def test_broadcasting():
np.random.seed(1234)
n = 4
# Construct a random covariance matrix.
data = np.random.randn(n, n)
cov = np.dot(data, data.T)
mean = np.random.randn(n)
# Construct an ndarray which can be interpreted as
# a 2x3 array whose elements are random data vectors.
X = np.random.randn(2, 3, n)
# Check that multiple data points can be evaluated at once.
for i in range(2):
for j in range(3):
actual = multivariate_normal.pdf(X[i, j], mean, cov)
desired = multivariate_normal.pdf(X, mean, cov)[i, j]
assert_allclose(actual, desired)
def test_normal_1D():
# The probability density function for a 1D normal variable should
# agree with the standard normal distribution in scipy.stats.distributions
x = np.linspace(0, 2, 10)
mean, cov = 1.2, 0.9
scale = cov**0.5
d1 = norm.pdf(x, mean, scale)
d2 = multivariate_normal.pdf(x, mean, cov)
assert_allclose(d1, d2)
def test_marginalization():
# Integrating out one of the variables of a 2D Gaussian should
# yield a 1D Gaussian
mean = np.array([2.5, 3.5])
cov = np.array([[.5, 0.2], [0.2, .6]])
n = 2 ** 8 + 1 # Number of samples
delta = 6 / (n - 1) # Grid spacing
v = np.linspace(0, 6, n)
xv, yv = np.meshgrid(v, v)
pos = np.empty((n, n, 2))
pos[:, :, 0] = xv
pos[:, :, 1] = yv
pdf = multivariate_normal.pdf(pos, mean, cov)
# Marginalize over x and y axis
margin_x = romb(pdf, delta, axis=0)
margin_y = romb(pdf, delta, axis=1)
# Compare with standard normal distribution
gauss_x = norm.pdf(v, loc=mean[0], scale=cov[0, 0] ** 0.5)
gauss_y = norm.pdf(v, loc=mean[1], scale=cov[1, 1] ** 0.5)
assert_allclose(margin_x, gauss_x, rtol=1e-2, atol=1e-2)
assert_allclose(margin_y, gauss_y, rtol=1e-2, atol=1e-2)
def test_frozen():
# The frozen distribution should agree with the regular one
np.random.seed(1234)
x = np.random.randn(5)
mean = np.random.randn(5)
cov = np.abs(np.random.randn(5))
norm_frozen = multivariate_normal(mean, cov)
assert_allclose(norm_frozen.pdf(x), multivariate_normal.pdf(x, mean, cov))
assert_allclose(norm_frozen.logpdf(x),
multivariate_normal.logpdf(x, mean, cov))
def test_pseudodet_pinv():
# Make sure that pseudo-inverse and pseudo-det agree on cutoff
# Assemble random covariance matrix with large and small eigenvalues
np.random.seed(1234)
n = 7
x = np.random.randn(n, n)
cov = np.dot(x, x.T)
s, u = scipy.linalg.eigh(cov)
s = 0.5 * np.ones(n)
s[0] = 1.0
s[-1] = 1e-7
cov = np.dot(u, np.dot(np.diag(s), u.T))
# Set cond so that the lowest eigenvalue is below the cutoff
cond = 1e-5
psd = _PSD(cov, cond=cond)
psd_pinv = _PSD(psd.pinv, cond=cond)
# Check that the log pseudo-determinant agrees with the sum
# of the logs of all but the smallest eigenvalue
assert_allclose(psd.log_pdet, np.sum(np.log(s[:-1])))
# Check that the pseudo-determinant of the pseudo-inverse
# agrees with 1 / pseudo-determinant
assert_allclose(-psd.log_pdet, psd_pinv.log_pdet)
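# Illustrative sketch (added for exposition, not an original helper): the
# quantity compared above can also be obtained directly from the eigenvalues,
# summing log(eigenvalue) over eigenvalues above a relative cutoff, roughly
# matching the cutoff behaviour of _PSD.
def _log_pseudo_det_sketch(cov, cond=1e-5):
    s = scipy.linalg.eigh(cov, eigvals_only=True)
    eps = cond * np.max(np.abs(s))
    return np.sum(np.log(s[s > eps]))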
def test_exception_nonsquare_cov():
cov = [[1, 2, 3], [4, 5, 6]]
assert_raises(ValueError, _PSD, cov)
def test_exception_nonfinite_cov():
cov_nan = [[1, 0], [0, np.nan]]
assert_raises(ValueError, _PSD, cov_nan)
cov_inf = [[1, 0], [0, np.inf]]
assert_raises(ValueError, _PSD, cov_inf)
def test_exception_non_psd_cov():
cov = [[1, 0], [0, -1]]
assert_raises(ValueError, _PSD, cov)
def test_exception_singular_cov():
np.random.seed(1234)
x = np.random.randn(5)
mean = np.random.randn(5)
cov = np.ones((5, 5))
e = np.linalg.LinAlgError
assert_raises(e, multivariate_normal, mean, cov)
assert_raises(e, multivariate_normal.pdf, x, mean, cov)
assert_raises(e, multivariate_normal.logpdf, x, mean, cov)
def test_R_values():
# Compare the multivariate pdf with some values precomputed
# in R version 3.0.1 (2013-05-16) on Mac OS X 10.6.
# The values below were generated by the following R-script:
# > library(mnormt)
# > x <- seq(0, 2, length=5)
# > y <- 3*x - 2
# > z <- x + cos(y)
# > mu <- c(1, 3, 2)
# > Sigma <- matrix(c(1,2,0,2,5,0.5,0,0.5,3), 3, 3)
# > r_pdf <- dmnorm(cbind(x,y,z), mu, Sigma)
r_pdf = np.array([0.0002214706, 0.0013819953, 0.0049138692,
0.0103803050, 0.0140250800])
x = np.linspace(0, 2, 5)
y = 3 * x - 2
z = x + np.cos(y)
r = np.array([x, y, z]).T
mean = np.array([1, 3, 2], 'd')
cov = np.array([[1, 2, 0], [2, 5, .5], [0, .5, 3]], 'd')
pdf = multivariate_normal.pdf(r, mean, cov)
assert_allclose(pdf, r_pdf, atol=1e-10)
def test_multivariate_normal_rvs_zero_covariance():
mean = np.zeros(2)
covariance = np.zeros((2, 2))
model = multivariate_normal(mean, covariance, allow_singular=True)
sample = model.rvs()
assert_equal(sample, [0, 0])
def test_rvs_shape():
# Check that rvs parses the mean and covariance correctly, and returns
# an array of the right shape
N = 300
d = 4
sample = multivariate_normal.rvs(mean=np.zeros(d), cov=1, size=N)
assert_equal(sample.shape, (N, d))
sample = multivariate_normal.rvs(mean=None,
cov=np.array([[2, .1], [.1, 1]]),
size=N)
assert_equal(sample.shape, (N, 2))
u = multivariate_normal(mean=0, cov=1)
sample = u.rvs(N)
assert_equal(sample.shape, (N, ))
def test_large_sample():
# Generate large sample and compare sample mean and sample covariance
# with mean and covariance matrix.
np.random.seed(2846)
n = 3
mean = np.random.randn(n)
M = np.random.randn(n, n)
cov = np.dot(M, M.T)
size = 5000
sample = multivariate_normal.rvs(mean, cov, size)
assert_allclose(numpy.cov(sample.T), cov, rtol=1e-1)
assert_allclose(sample.mean(0), mean, rtol=1e-1)
def test_entropy():
np.random.seed(2846)
n = 3
mean = np.random.randn(n)
M = np.random.randn(n, n)
cov = np.dot(M, M.T)
rv = multivariate_normal(mean, cov)
# Check that frozen distribution agrees with entropy function
assert_almost_equal(rv.entropy(), multivariate_normal.entropy(mean, cov))
# Compare entropy with manually computed expression involving
# the sum of the logs of the eigenvalues of the covariance matrix
eigs = np.linalg.eig(cov)[0]
desired = 1 / 2 * (n * (np.log(2 * np.pi) + 1) + np.sum(np.log(eigs)))
assert_almost_equal(desired, rv.entropy())
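# Illustrative sketch (not in the original tests): the expression above is the
# standard closed form for multivariate normal differential entropy,
# 0.5 * (n * (log(2*pi) + 1) + log det(cov)), computed here via slogdet.
def _gaussian_entropy_sketch(cov):
    n = cov.shape[0]
    _, logdet = np.linalg.slogdet(cov)
    return 0.5 * (n * (np.log(2 * np.pi) + 1) + logdet)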
def test_lnB():
alpha = np.array([1, 1, 1])
desired = .5 # e^lnB = 1/2 for [1, 1, 1]
assert_almost_equal(np.exp(_lnB(alpha)), desired)
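# Illustrative sketch (assumed helper, not in the original tests): _lnB is the
# log of the multivariate beta function, ln B(alpha) = sum(gammaln(alpha)) -
# gammaln(sum(alpha)); for alpha = [1, 1, 1] this gives B = 1/Gamma(3) = 1/2,
# consistent with the assertion above.
def _lnB_sketch(alpha):
    from scipy.special import gammaln
    alpha = np.asarray(alpha, dtype=float)
    return np.sum(gammaln(alpha)) - gammaln(np.sum(alpha))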
def test_frozen_dirichlet():
np.random.seed(2846)
n = np.random.randint(1, 32)
alpha = np.random.uniform(10e-10, 100, n)
d = dirichlet(alpha)
assert_equal(d.var(), dirichlet.var(alpha))
assert_equal(d.mean(), dirichlet.mean(alpha))
assert_equal(d.entropy(), dirichlet.entropy(alpha))
num_tests = 10
for i in range(num_tests):
x = np.random.uniform(10e-10, 100, n)
x /= np.sum(x)
assert_equal(d.pdf(x[:-1]), dirichlet.pdf(x[:-1], alpha))
assert_equal(d.logpdf(x[:-1]), dirichlet.logpdf(x[:-1], alpha))
def test_simple_values():
alpha = np.array([1, 1])
d = dirichlet(alpha)
assert_almost_equal(d.mean(), 0.5)
assert_almost_equal(d.var(), 1. / 12.)
b = beta(1, 1)
assert_almost_equal(d.mean(), b.mean())
assert_almost_equal(d.var(), b.var())
def test_K_and_K_minus_1_calls_equal():
# Test that calls with K and K-1 entries yield the same results.
np.random.seed(2846)
n = np.random.randint(1, 32)
alpha = np.random.uniform(10e-10, 100, n)
d = dirichlet(alpha)
num_tests = 10
for i in range(num_tests):
x = np.random.uniform(10e-10, 100, n)
x /= np.sum(x)
assert_almost_equal(d.pdf(x[:-1]), d.pdf(x))
def test_multiple_entry_calls():
# Test that calls with multiple x vectors as matrix work
np.random.seed(2846)
n = np.random.randint(1, 32)
alpha = np.random.uniform(10e-10, 100, n)
d = dirichlet(alpha)
num_tests = 10
num_multiple = 5
xm = None
for i in range(num_tests):
for m in range(num_multiple):
x = np.random.uniform(10e-10, 100, n)
x /= np.sum(x)
if xm is not None:
xm = np.vstack((xm, x))
else:
xm = x
rm = d.pdf(xm.T)
rs = None
for xs in xm:
r = d.pdf(xs)
if rs is not None:
rs = np.append(rs, r)
else:
rs = r
assert_array_almost_equal(rm, rs)
def test_2D_dirichlet_is_beta():
np.random.seed(2846)
alpha = np.random.uniform(10e-10, 100, 2)
d = dirichlet(alpha)
b = beta(alpha[0], alpha[1])
num_tests = 10
for i in range(num_tests):
x = np.random.uniform(10e-10, 100, 2)
x /= np.sum(x)
assert_almost_equal(b.pdf(x), d.pdf([x]))
assert_almost_equal(b.mean(), d.mean()[0])
assert_almost_equal(b.var(), d.var()[0])
def test_dimensions_mismatch():
# Regression test for GH #3493. Check that setting up a PDF with a mean of
# length M and a covariance matrix of size (N, N), where M != N, raises a
# ValueError with an informative error message.
mu = np.array([0.0, 0.0])
sigma = np.array([[1.0]])
assert_raises(ValueError, multivariate_normal, mu, sigma)
# A simple check that the right error message was passed along. Checking
# that the entire message is there, word for word, would be somewhat
# fragile, so we just check for the leading part.
try:
multivariate_normal(mu, sigma)
except ValueError as e:
msg = "Dimension mismatch"
assert_equal(str(e)[:len(msg)], msg)
if __name__ == "__main__":
run_module_suite()

@ -1,193 +0,0 @@
from __future__ import division, print_function, absolute_import
import numpy as np
from numpy.testing import TestCase, run_module_suite, assert_equal, \
assert_array_equal
from wafo.stats import rankdata, tiecorrect
class TestTieCorrect(TestCase):
def test_empty(self):
"""An empty array requires no correction, should return 1.0."""
ranks = np.array([], dtype=np.float64)
c = tiecorrect(ranks)
assert_equal(c, 1.0)
def test_one(self):
"""A single element requires no correction, should return 1.0."""
ranks = np.array([1.0], dtype=np.float64)
c = tiecorrect(ranks)
assert_equal(c, 1.0)
def test_no_correction(self):
"""Arrays with no ties require no correction."""
ranks = np.arange(2.0)
c = tiecorrect(ranks)
assert_equal(c, 1.0)
ranks = np.arange(3.0)
c = tiecorrect(ranks)
assert_equal(c, 1.0)
def test_basic(self):
"""Check a few basic examples of the tie correction factor."""
# One tie of two elements
ranks = np.array([1.0, 2.5, 2.5])
c = tiecorrect(ranks)
T = 2.0
N = ranks.size
expected = 1.0 - (T**3 - T) / (N**3 - N)
assert_equal(c, expected)
# One tie of two elements (same as above, but tie is not at the end)
ranks = np.array([1.5, 1.5, 3.0])
c = tiecorrect(ranks)
T = 2.0
N = ranks.size
expected = 1.0 - (T**3 - T) / (N**3 - N)
assert_equal(c, expected)
# One tie of three elements
ranks = np.array([1.0, 3.0, 3.0, 3.0])
c = tiecorrect(ranks)
T = 3.0
N = ranks.size
expected = 1.0 - (T**3 - T) / (N**3 - N)
assert_equal(c, expected)
# Two ties, lengths 2 and 3.
ranks = np.array([1.5, 1.5, 4.0, 4.0, 4.0])
c = tiecorrect(ranks)
T1 = 2.0
T2 = 3.0
N = ranks.size
expected = 1.0 - ((T1**3 - T1) + (T2**3 - T2)) / (N**3 - N)
assert_equal(c, expected)
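# Illustrative sketch (added for exposition, not an original test helper): the
# correction factor exercised above is 1 - sum(t**3 - t) / (N**3 - N), where t
# runs over the sizes of the tied groups in the rank array.
def _tiecorrect_sketch(ranks):
    ranks = np.asarray(ranks, dtype=float)
    N = ranks.size
    if N < 2:
        return 1.0
    _, counts = np.unique(ranks, return_counts=True)
    return 1.0 - np.sum(counts**3 - counts) / float(N**3 - N)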
class TestRankData(TestCase):
def test_empty(self):
"""stats.rankdata([]) should return an empty array."""
a = np.array([], dtype=int)
r = rankdata(a)
assert_array_equal(r, np.array([], dtype=np.float64))
r = rankdata([])
assert_array_equal(r, np.array([], dtype=np.float64))
def test_one(self):
"""Check stats.rankdata with an array of length 1."""
data = [100]
a = np.array(data, dtype=int)
r = rankdata(a)
assert_array_equal(r, np.array([1.0], dtype=np.float64))
r = rankdata(data)
assert_array_equal(r, np.array([1.0], dtype=np.float64))
def test_basic(self):
"""Basic tests of stats.rankdata."""
data = [100, 10, 50]
expected = np.array([3.0, 1.0, 2.0], dtype=np.float64)
a = np.array(data, dtype=int)
r = rankdata(a)
assert_array_equal(r, expected)
r = rankdata(data)
assert_array_equal(r, expected)
data = [40, 10, 30, 10, 50]
expected = np.array([4.0, 1.5, 3.0, 1.5, 5.0], dtype=np.float64)
a = np.array(data, dtype=int)
r = rankdata(a)
assert_array_equal(r, expected)
r = rankdata(data)
assert_array_equal(r, expected)
data = [20, 20, 20, 10, 10, 10]
expected = np.array([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=np.float64)
a = np.array(data, dtype=int)
r = rankdata(a)
assert_array_equal(r, expected)
r = rankdata(data)
assert_array_equal(r, expected)
# The docstring states explicitly that the argument is flattened.
a2d = a.reshape(2, 3)
r = rankdata(a2d)
assert_array_equal(r, expected)
def test_large_int(self):
data = np.array([2**60, 2**60+1], dtype=np.uint64)
r = rankdata(data)
assert_array_equal(r, [1.0, 2.0])
data = np.array([2**60, 2**60+1], dtype=np.int64)
r = rankdata(data)
assert_array_equal(r, [1.0, 2.0])
data = np.array([2**60, -2**60+1], dtype=np.int64)
r = rankdata(data)
assert_array_equal(r, [2.0, 1.0])
def test_big_tie(self):
for n in [10000, 100000, 1000000]:
data = np.ones(n, dtype=int)
r = rankdata(data)
expected_rank = 0.5 * (n + 1)
assert_array_equal(r, expected_rank * data,
"test failed with n=%d" % n)
_cases = (
# values, method, expected
([], 'average', []),
([], 'min', []),
([], 'max', []),
([], 'dense', []),
([], 'ordinal', []),
#
([100], 'average', [1.0]),
([100], 'min', [1.0]),
([100], 'max', [1.0]),
([100], 'dense', [1.0]),
([100], 'ordinal', [1.0]),
#
([100, 100, 100], 'average', [2.0, 2.0, 2.0]),
([100, 100, 100], 'min', [1.0, 1.0, 1.0]),
([100, 100, 100], 'max', [3.0, 3.0, 3.0]),
([100, 100, 100], 'dense', [1.0, 1.0, 1.0]),
([100, 100, 100], 'ordinal', [1.0, 2.0, 3.0]),
#
([100, 300, 200], 'average', [1.0, 3.0, 2.0]),
([100, 300, 200], 'min', [1.0, 3.0, 2.0]),
([100, 300, 200], 'max', [1.0, 3.0, 2.0]),
([100, 300, 200], 'dense', [1.0, 3.0, 2.0]),
([100, 300, 200], 'ordinal', [1.0, 3.0, 2.0]),
#
([100, 200, 300, 200], 'average', [1.0, 2.5, 4.0, 2.5]),
([100, 200, 300, 200], 'min', [1.0, 2.0, 4.0, 2.0]),
([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]),
([100, 200, 300, 200], 'dense', [1.0, 2.0, 3.0, 2.0]),
([100, 200, 300, 200], 'ordinal', [1.0, 2.0, 4.0, 3.0]),
#
([100, 200, 300, 200, 100], 'average', [1.5, 3.5, 5.0, 3.5, 1.5]),
([100, 200, 300, 200, 100], 'min', [1.0, 3.0, 5.0, 3.0, 1.0]),
([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 2.0]),
([100, 200, 300, 200, 100], 'dense', [1.0, 2.0, 3.0, 2.0, 1.0]),
([100, 200, 300, 200, 100], 'ordinal', [1.0, 3.0, 5.0, 4.0, 2.0]),
#
([10] * 30, 'ordinal', np.arange(1.0, 31.0)),
)
def test_cases():
def check_case(values, method, expected):
r = rankdata(values, method=method)
assert_array_equal(r, expected)
for values, method, expected in _cases:
yield check_case, values, method, expected
if __name__ == "__main__":
run_module_suite()

File diff suppressed because it is too large

@ -1,91 +0,0 @@
from __future__ import division, print_function, absolute_import
import numpy as np
from numpy.testing import assert_allclose, assert_equal, run_module_suite
from scipy.stats._tukeylambda_stats import tukeylambda_variance, \
tukeylambda_kurtosis
def test_tukeylambda_stats_known_exact():
"""Compare results with some known exact formulas."""
# Some exact values of the Tukey Lambda variance and kurtosis:
# lambda var kurtosis
# 0 pi**2/3 6/5 (logistic distribution)
# 0.5 4 - pi (5/3 - pi/2)/(pi/4 - 1)**2 - 3
# 1 1/3 -6/5 (uniform distribution on (-1,1))
# 2 1/12 -6/5 (uniform distribution on (-1/2, 1/2))
# lambda = 0
var = tukeylambda_variance(0)
assert_allclose(var, np.pi**2 / 3, atol=1e-12)
kurt = tukeylambda_kurtosis(0)
assert_allclose(kurt, 1.2, atol=1e-10)
# lambda = 0.5
var = tukeylambda_variance(0.5)
assert_allclose(var, 4 - np.pi, atol=1e-12)
kurt = tukeylambda_kurtosis(0.5)
desired = (5./3 - np.pi/2) / (np.pi/4 - 1)**2 - 3
assert_allclose(kurt, desired, atol=1e-10)
# lambda = 1
var = tukeylambda_variance(1)
assert_allclose(var, 1.0 / 3, atol=1e-12)
kurt = tukeylambda_kurtosis(1)
assert_allclose(kurt, -1.2, atol=1e-10)
# lambda = 2
var = tukeylambda_variance(2)
assert_allclose(var, 1.0 / 12, atol=1e-12)
kurt = tukeylambda_kurtosis(2)
assert_allclose(kurt, -1.2, atol=1e-10)
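# Illustrative sketch (added for exposition): the exact values above follow
# from the closed form for the Tukey lambda variance (valid for lam > -0.5,
# lam != 0): var = (2/lam**2) * (1/(1 + 2*lam) - gamma(lam+1)**2/gamma(2*lam+2)).
def _tukeylambda_variance_closed_form(lam):
    from scipy.special import gamma
    return 2.0 / lam**2 * (1.0 / (1.0 + 2.0 * lam)
                           - gamma(lam + 1.0)**2 / gamma(2.0 * lam + 2.0))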
def test_tukeylambda_stats_mpmath():
"""Compare results with some values that were computed using mpmath."""
a10 = dict(atol=1e-10, rtol=0)
a12 = dict(atol=1e-12, rtol=0)
data = [
# lambda variance kurtosis
[-0.1, 4.78050217874253547, 3.78559520346454510],
[-0.0649, 4.16428023599895777, 2.52019675947435718],
[-0.05, 3.93672267890775277, 2.13129793057777277],
[-0.001, 3.30128380390964882, 1.21452460083542988],
[0.001, 3.27850775649572176, 1.18560634779287585],
[0.03125, 2.95927803254615800, 0.804487555161819980],
[0.05, 2.78281053405464501, 0.611604043886644327],
[0.0649, 2.65282386754100551, 0.476834119532774540],
[1.2, 0.242153920578588346, -1.23428047169049726],
[10.0, 0.00095237579757703597, 2.37810697355144933],
[20.0, 0.00012195121951131043, 7.37654321002709531],
]
for lam, var_expected, kurt_expected in data:
var = tukeylambda_variance(lam)
assert_allclose(var, var_expected, **a12)
kurt = tukeylambda_kurtosis(lam)
assert_allclose(kurt, kurt_expected, **a10)
# Test with vector arguments (most of the other tests are for single
# values).
lam, var_expected, kurt_expected = zip(*data)
var = tukeylambda_variance(lam)
assert_allclose(var, var_expected, **a12)
kurt = tukeylambda_kurtosis(lam)
assert_allclose(kurt, kurt_expected, **a10)
def test_tukeylambda_stats_invalid():
"""Test values of lambda outside the domains of the functions."""
lam = [-1.0, -0.5]
var = tukeylambda_variance(lam)
assert_equal(var, np.array([np.nan, np.inf]))
lam = [-1.0, -0.25]
kurt = tukeylambda_kurtosis(lam)
assert_equal(kurt, np.array([np.nan, np.inf]))
if __name__ == "__main__":
run_module_suite()

@ -1,412 +0,0 @@
"""
Commentary
----------
Most of the work is done by the scipy.stats.distributions module.
This provides a plethora of continuous distributions to play with.
Each distribution has functions to generate random deviates, pdf's,
cdf's etc. as well as a function to fit the distribution to some given
data.
The fitting uses scipy.optimize.fmin to minimise the negative
log-likelihood of the data given the distribution.
There are a couple of problems with this approach. First, it is
sensitive to the initial guess at the parameters. Second, it can be a
little slow.
Two key parameters are 'loc' and 'scale'. Data is shifted by 'loc' and
scaled by 'scale' prior to fitting, so supplying appropriate values for
these parameters is important for getting a good fit.
See the factory() function, which picks from a handful of common
approaches for each distribution.
For some distributions (e.g. the normal) it really makes sense just to
calculate the parameters directly from the data.
The code under "if __name__ == '__main__'" should be a good guide to how
to use this.
Simply:
get a QuickFit object
add the distributions you want to try to fit
call fit() with your data
call fit_stats() to generate some stats on the fit.
call plot() if you want to see a plot.
Named after Mrs Twolumps, minister's secretary in the silly walks
sketch, who brings in coffee with a full silly walk.
The tenuous link with curve fitting is that you generally see "two lumps":
one in your data and the other in the curve being fitted.
Or, alternatively, if your data is not too silly then you can fit a
curve to it.
License is GNU LGPL v3, see https://launchpad.net/twolumps
"""
import inspect
import numpy
from wafo import stats
from numpy import mean, std
def factory(name):
""" Factory to return appropriate objects for each distro. """
fitters = dict(
beta=ZeroOneScipyDistribution,
alpha=ZeroOneScipyDistribution,
ncf=ZeroOneScipyDistribution,
triang=ZeroOneScipyDistribution,
uniform=ZeroOneScipyDistribution,
powerlaw=ZeroOneScipyDistribution,
pareto=MinLocScipyDistribution,
expon=MinLocScipyDistribution,
gamma=MinLocScipyDistribution,
lognorm=MinLocScipyDistribution,
maxwell=MinLocScipyDistribution,
weibull_min=MinLocScipyDistribution,
weibull_max=MaxLocScipyDistribution)
return fitters.get(name, ScipyDistribution)(name)
def get_continuous_distros():
""" Find all attributes of stats that are continuous distributions. """
fitters = []
skip = set()
for name, item in inspect.getmembers(stats):
if name in skip: continue
if item is stats.rv_continuous: continue
if isinstance(item, stats.rv_continuous):
fitters.append([name, factory(name)])
return fitters
class ScipyDistribution(object):
def __init__(self, name):
self.name = name
self.distro = self.get_distro()
self.fitted = None
def __getattr__(self, attr):
""" Try delegating to the distro object """
return getattr(self.distro, attr)
def get_distro(self):
return getattr(stats, self.name)
def set_distro(self, parms):
self.distro = getattr(stats, self.name)(*parms)
return self.distro
def calculate_loc_and_scale(self, data):
""" Calculate loc and scale parameters for fit.
Depending on the distribution, these need to be approximately
right to get a good fit.
"""
return mean(data), std(data)
def fit(self, data, *args, **kwargs):
""" This needs some work.
The various scipy distributions seem to do a reasonable job if given a good hint.
Distribution-specific hints still need to be worked out.
"""
fits = []
# try with and without providing loc and scale hints
# increases chance of a fit without an exception being
# generated.
for (loc, scale) in ((0.0, 1.0),
self.calculate_loc_and_scale(data)):
try:
parms = self.get_distro().fit(data, loc=loc, scale=scale)
self.set_distro(list(parms))
expected = self.expected(data)
rss = ((expected-data)**2).sum()
fits.append([rss, list(parms)])
parms = self.get_distro().fit(data, floc=loc, scale=scale)
self.set_distro(list(parms))
expected = self.expected(data)
rss = ((expected-data)**2).sum()
fits.append([rss, list(parms)])
except Exception:
pass  # failures are expected for some hints; try the next one
# no fits means all tries raised exceptions
if not fits:
raise Exception("Exception in fit()")
# pick the one with the smallest rss
fits.sort()
self.parms = fits[0][1]
print(self.parms)
return self.set_distro(list(self.parms))
def expected(self, data):
""" Calculate expected values at each data point """
if self.fitted is not None:
return self.fitted
n = len(data)
xx = numpy.linspace(0, 1, n + 2)[1:-1]
self.fitted = self.ppf(xx)
#self.fitted = [self.ppf(x) for x in xx]
return self.fitted
def fit_stats(self, data):
""" Return stats on the fits
data assumed to be sorted.
"""
n = len(data)
dvar = numpy.var(data)
expected = self.expected(data)
evar = numpy.var(expected)
rss = 0.0
for expect, obs in zip(expected, data):
rss += (obs-expect) ** 2.0
self.rss = rss
self.dss = dvar * n
self.fss = evar * n
def residuals(self, data):
""" Return residuals """
expected = self.expected(data)
return numpy.array(data) - numpy.array(expected)
class MinLocScipyDistribution(ScipyDistribution):
def calculate_loc_and_scale(self, data):
""" Set loc to min value in the data.
Useful for weibull_min
"""
return min(data), std(data)
class MaxLocScipyDistribution(ScipyDistribution):
def calculate_loc_and_scale(self, data):
""" Set loc to max value in the data.
Useful for weibull_max
"""
return max(data), std(data)
class ZeroOneScipyDistribution(ScipyDistribution):
def calculate_loc_and_scale(self, data):
""" Set loc and scale to move to [0, 1] interval.
Useful for beta distribution
"""
return min(data), max(data)-min(data)
class QuickFit(object):
""" Fit a family of distributions.
Calculates stats on each fit.
Option to create plots.
"""
def __init__(self):
self.distributions = []
def add_distribution(self, distribution):
""" Add a ready-prepared ScipyDistribution """
self.distributions.append(distribution)
def add(self, name):
""" Add a distribution by name. """
self.distributions.append(factory(name))
def fit(self, data):
""" Fit all of the distros we have """
fitted = []
for distro in self.distributions:
print('fitting distro', distro.name)
try:
distro.fit(data)
except Exception:
continue  # skip distributions that fail to fit
fitted.append(distro)
self.distributions = fitted
print('finished fitting')
def stats(self, data):
""" Return stats on the fits """
for dd in self.distributions:
dd.fit_stats(data)
def get_topn(self, n):
""" Return top-n best fits. """
data = [[x.rss, x] for x in self.distributions if numpy.isfinite(x.rss)]
data.sort()
if not n:
n = len(data)
return [x[1] for x in data[:n]]
def fit_plot(self, data, topn=0, bins=20):
""" Create a plot. """
from matplotlib import pylab as pl
distros = self.get_topn(topn)
xx = numpy.linspace(data.min(), data.max(), 300)
table = []
nparms = max(len(x.parms) for x in distros)
tcolours = []
for dd in distros:
patch = pl.plot(xx, [dd.pdf(p) for p in xx], label='%10.2f%% %s' % (100.0*dd.rss/dd.dss, dd.name))
row = ['', dd.name, '%10.2f%%' % (100.0*dd.rss/dd.dss,)] + ['%0.2f' % x for x in dd.parms]
while len(row) < 3 + nparms:
row.append('')
table.append(row)
tcolours.append([patch[0].get_markerfacecolor()] + ['w'] * (2+nparms))
# add a histogram of the data
pl.hist(data, bins=bins, normed=True)
tab = pl.table(cellText=table, cellColours=tcolours,
colLabels=['', 'Distribution', 'Res. SS/Data SS'] + ['P%d' % (x + 1,) for x in range(nparms)],
bbox=(0.0, 1.0, 1.0, 0.3))
#loc='top'))
#pl.legend(loc=0)
tab.auto_set_font_size(False)
tab.set_fontsize(10.)
def residual_plot(self, data, topn=0):
""" Create a residual plot. """
from matplotlib import pylab as pl
distros = self.get_topn(topn)
n = len(data)
xx = numpy.linspace(0, 1, n + 2)[1:-1]
for dd in distros:
pl.plot(xx, dd.residuals(data), label='%10.2f%% %s' % (100.0*dd.rss/dd.dss, dd.name))
pl.grid(True)
def plot(self, data, topn):
""" Plot data fit and residuals """
from matplotlib import pylab as pl
pl.axes([0.1, 0.4, 0.8, 0.4]) # leave room above the axes for the table
self.fit_plot(data, topn=topn)
pl.axes([0.1, 0.05, 0.8, 0.3])
self.residual_plot(data, topn=topn)
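# Hedged usage sketch (added for illustration; the names below refer to the
# classes defined in this module): it follows the workflow listed in the
# module docstring -- build a QuickFit, register distributions by name,
# fit sorted data, then rank the fits by residual sum of squares.
def _quickfit_usage_sketch():
    sample = numpy.sort(numpy.random.normal(loc=2.0, scale=0.5, size=500))
    qf = QuickFit()
    qf.add('norm')
    qf.add('weibull_min')
    qf.fit(sample)           # keeps only the distributions that fitted cleanly
    qf.stats(sample)         # computes rss/dss/fss for each surviving fit
    return qf.get_topn(1)    # best fit by residual sum of squares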
def read_data(infile, field):
""" Simple utility to extract a field out of a csv file. """
import csv
reader = csv.reader(infile)
header = next(reader)
field = header.index(field)
data = []
for row in reader:
data.append(float(row[field]))
return data
if __name__ == '__main__':
import sys
import optparse
from matplotlib import pylab as pl
parser = optparse.OptionParser()
parser.add_option('-d', '--distro', action='append', default=[])
parser.add_option('-l', '--list', action='store_true',
help='List available distros')
parser.add_option('-i', '--infile')
parser.add_option('-f', '--field', default='P/L')
parser.add_option('-n', '--topn', type='int', default=0)
parser.add_option('-s', '--sample', default='normal',
help='generate a sample from this distro as a test')
parser.add_option('--size', type='int', default=1000,
help='Size of sample to generate')
opts, args = parser.parse_args()
if opts.list:
for name, distro in get_continuous_distros():
print(name)
sys.exit()
# opts.distro = ['weibull_min', 'norm']  # hard-coded override (likely a debugging leftover); it would defeat the -d option
if not opts.distro:
opts.distro = [x[0] for x in get_continuous_distros()]
quickfit = QuickFit()
for distro in opts.distro:
quickfit.add(distro)
if opts.sample:
data = getattr(numpy.random, opts.sample)(size=opts.size)
else:
data = numpy.array(read_data(open(opts.infile), opts.field))
data.sort()
quickfit.fit(data)
print('doing stats')
quickfit.stats(data)
print('doing plot')
quickfit.plot(data, topn=opts.topn)
pl.show()

@ -1,47 +0,0 @@
from __future__ import division, print_function, absolute_import
import numpy as np
import scipy.stats
from scipy.special import i0
def von_mises_cdf_series(k,x,p):
x = float(x)
s = np.sin(x)
c = np.cos(x)
sn = np.sin(p*x)
cn = np.cos(p*x)
R = 0
V = 0
for n in range(p-1,0,-1):
sn, cn = sn*c - cn*s, cn*c + sn*s
R = 1./(2*n/k + R)
V = R*(sn/n+V)
return 0.5+x/(2*np.pi) + V/np.pi
def von_mises_cdf_normalapprox(k,x,C1):
b = np.sqrt(2/np.pi)*np.exp(k)/i0(k)
z = b*np.sin(x/2.)
return scipy.stats.norm.cdf(z)
def von_mises_cdf(k,x):
ix = 2*np.pi*np.round(x/(2*np.pi))
x = x-ix
k = float(k)
# These values should give 12 decimal digits
CK = 50
a = [28., 0.5, 100., 5.0]
C1 = 50.1
if k < CK:
p = int(np.ceil(a[0]+a[1]*k-a[2]/(k+a[3])))
F = np.clip(von_mises_cdf_series(k,x,p),0,1)
else:
F = von_mises_cdf_normalapprox(k,x,C1)
return F+ix
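# Hedged sanity-check sketch (added for illustration): for kappa below the
# series cutoff the CDF above should track scipy.stats.vonmises.cdf closely;
# for large kappa both fall back to a normal approximation, so the maximum
# discrepancy is just printed here rather than asserted.
def _von_mises_cdf_check():
    xs = np.linspace(-np.pi + 1e-6, np.pi - 1e-6, 9)
    for k in (0.5, 5.0, 80.0):
        ours = np.array([von_mises_cdf(k, x) for x in xs])
        ref = scipy.stats.vonmises.cdf(xs, k)
        print(k, np.max(np.abs(ours - ref)))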

@ -1,76 +0,0 @@
import numpy as np
import scipy.stats
from scipy.special import i0
import numpy.testing
cimport numpy as np
cdef extern from "math.h":
double cos(double theta)
double sin(double theta)
cdef double von_mises_cdf_series(double k,double x,unsigned int p):
cdef double s, c, sn, cn, R, V
cdef unsigned int n
s = sin(x)
c = cos(x)
sn = sin(p*x)
cn = cos(p*x)
R = 0
V = 0
for n in range(p-1,0,-1):
sn, cn = sn*c - cn*s, cn*c + sn*s
R = 1./(2*n/k + R)
V = R*(sn/n+V)
return 0.5+x/(2*np.pi) + V/np.pi
def von_mises_cdf_normalapprox(k,x,C1):
b = np.sqrt(2/np.pi)*np.exp(k)/i0(k)
z = b*np.sin(x/2.)
C = 24*k
chi = z - z**3/((C-2*z**2-16)/3.-(z**4+7/4.*z**2+167./2)/(C+C1-z**2+3))**2
return scipy.stats.norm.cdf(z)
cimport cython
@cython.boundscheck(False)
def von_mises_cdf(k,x):
cdef np.ndarray[double, ndim=1] temp, temp_xs, temp_ks
cdef unsigned int i, p
cdef double a1, a2, a3, a4, C1, CK
#k,x = np.broadcast_arrays(np.asarray(k),np.asarray(x))
k = np.asarray(k)
x = np.asarray(x)
zerodim = k.ndim==0 and x.ndim==0
k = np.atleast_1d(k)
x = np.atleast_1d(x)
ix = np.round(x/(2*np.pi))
x = x-ix*2*np.pi
# These values should give 12 decimal digits
CK=50
a1, a2, a3, a4 = [28., 0.5, 100., 5.0]
C1 = 50.1
bx, bk = np.broadcast_arrays(x,k)
result = np.empty(bx.shape,dtype=float)
c_small_k = bk<CK
temp = result[c_small_k]
temp_xs = bx[c_small_k].astype(float)
temp_ks = bk[c_small_k].astype(float)
for i in range(len(temp)):
p = <int>(1+a1+a2*temp_ks[i]-a3/(temp_ks[i]+a4))
temp[i] = von_mises_cdf_series(temp_ks[i],temp_xs[i],p)
if temp[i]<0:
temp[i]=0
elif temp[i]>1:
temp[i]=1
result[c_small_k] = temp
result[~c_small_k] = von_mises_cdf_normalapprox(bk[~c_small_k],bx[~c_small_k],C1)
if not zerodim:
return result+ix
else:
return (result+ix)[0]