Simplified wafo.stats:
- Deleted obsolete files.
- Requires scipy v0.16.
- _distn_infrastructure.py monkeypatches scipy.stats._distn_infrastructure.py

branch: master
parent 7403d821df
commit e73624161c
@@ -1,408 +0,0 @@
from __future__ import division, print_function, absolute_import

import warnings

import numpy as np
from scipy._lib.six import callable


def binned_statistic(x, values, statistic='mean',
                     bins=10, range=None):
    """
    Compute a binned statistic for a set of data.

    This is a generalization of a histogram function.  A histogram divides
    the space into bins, and returns the count of the number of points in
    each bin.  This function allows the computation of the sum, mean, median,
    or other statistic of the values within each bin.

    Parameters
    ----------
    x : array_like
        A sequence of values to be binned.
    values : array_like
        The values on which the statistic will be computed.  This must be
        the same shape as `x`.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean').
        The following statistics are available:

          * 'mean' : compute the mean of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'median' : compute the median of values for points within each
            bin. Empty bins will be represented by NaN.
          * 'count' : compute the count of points within each bin.  This is
            identical to an unweighted histogram.  `values` array is not
            referenced.
          * 'sum' : compute the sum of values for points within each bin.
            This is identical to a weighted histogram.
          * function : a user-defined function which takes a 1D array of
            values, and outputs a single numerical statistic. This function
            will be called on the values in each bin.  Empty bins will be
            represented by function([]), or NaN if this returns an error.

    bins : int or sequence of scalars, optional
        If `bins` is an int, it defines the number of equal-width
        bins in the given range (10, by default). If `bins` is a sequence,
        it defines the bin edges, including the rightmost edge, allowing
        for non-uniform bin widths.
    range : (float, float) or [(float, float)], optional
        The lower and upper range of the bins.  If not provided, range
        is simply ``(x.min(), x.max())``.  Values outside the range are
        ignored.

    Returns
    -------
    statistic : array
        The values of the selected statistic in each bin.
    bin_edges : array of dtype float
        Return the bin edges ``(length(statistic)+1)``.
    binnumber : 1-D ndarray of ints
        This assigns to each observation an integer that represents the bin
        in which this observation falls. Array has the same length as values.

    See Also
    --------
    numpy.histogram, binned_statistic_2d, binned_statistic_dd

    Notes
    -----
    All but the last (righthand-most) bin is half-open.  In other words, if
    `bins` is::

      [1, 2, 3, 4]

    then the first bin is ``[1, 2)`` (including 1, but excluding 2) and the
    second ``[2, 3)``.  The last bin, however, is ``[3, 4]``, which *includes*
    4.

    .. versionadded:: 0.11.0

    Examples
    --------
    >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
    ...                        bins=3)
    (array([ 1.,  2.,  4.]), array([ 1.,  2.,  3.,  4.]), array([1, 2, 1, 2, 3]))

    >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean', bins=3)
    (array([ 1.,  2.,  4.]), array([ 1.,  2.,  3.,  4.]), array([1, 2, 1, 2, 3]))

    """
    try:
        N = len(bins)
    except TypeError:
        N = 1

    if N != 1:
        bins = [np.asarray(bins, float)]

    if range is not None:
        if len(range) == 2:
            range = [range]

    medians, edges, xy = binned_statistic_dd([x], values, statistic,
                                             bins, range)

    return medians, edges[0], xy


def binned_statistic_2d(x, y, values, statistic='mean',
                        bins=10, range=None):
    """
    Compute a bidimensional binned statistic for a set of data.

    This is a generalization of a histogram2d function.  A histogram divides
    the space into bins, and returns the count of the number of points in
    each bin.  This function allows the computation of the sum, mean, median,
    or other statistic of the values within each bin.

    Parameters
    ----------
    x : (N,) array_like
        A sequence of values to be binned along the first dimension.
    y : (M,) array_like
        A sequence of values to be binned along the second dimension.
    values : (N,) array_like
        The values on which the statistic will be computed.  This must be
        the same shape as `x`.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean').
        The following statistics are available:

          * 'mean' : compute the mean of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'median' : compute the median of values for points within each
            bin. Empty bins will be represented by NaN.
          * 'count' : compute the count of points within each bin.  This is
            identical to an unweighted histogram.  `values` array is not
            referenced.
          * 'sum' : compute the sum of values for points within each bin.
            This is identical to a weighted histogram.
          * function : a user-defined function which takes a 1D array of
            values, and outputs a single numerical statistic. This function
            will be called on the values in each bin.  Empty bins will be
            represented by function([]), or NaN if this returns an error.

    bins : int or [int, int] or array-like or [array, array], optional
        The bin specification:

          * the number of bins for the two dimensions (nx=ny=bins),
          * the number of bins in each dimension (nx, ny = bins),
          * the bin edges for the two dimensions (x_edges = y_edges = bins),
          * the bin edges in each dimension (x_edges, y_edges = bins).

    range : (2,2) array_like, optional
        The leftmost and rightmost edges of the bins along each dimension
        (if not specified explicitly in the `bins` parameters):
        [[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
        considered outliers and not tallied in the histogram.

    Returns
    -------
    statistic : (nx, ny) ndarray
        The values of the selected statistic in each two-dimensional bin
    xedges : (nx + 1) ndarray
        The bin edges along the first dimension.
    yedges : (ny + 1) ndarray
        The bin edges along the second dimension.
    binnumber : 1-D ndarray of ints
        This assigns to each observation an integer that represents the bin
        in which this observation falls. Array has the same length as `values`.

    See Also
    --------
    numpy.histogram2d, binned_statistic, binned_statistic_dd

    Notes
    -----

    .. versionadded:: 0.11.0

    """

    # This code is based on np.histogram2d
    try:
        N = len(bins)
    except TypeError:
        N = 1

    if N != 1 and N != 2:
        xedges = yedges = np.asarray(bins, float)
        bins = [xedges, yedges]

    medians, edges, xy = binned_statistic_dd([x, y], values, statistic,
                                             bins, range)

    return medians, edges[0], edges[1], xy


def binned_statistic_dd(sample, values, statistic='mean',
                        bins=10, range=None):
    """
    Compute a multidimensional binned statistic for a set of data.

    This is a generalization of a histogramdd function.  A histogram divides
    the space into bins, and returns the count of the number of points in
    each bin.  This function allows the computation of the sum, mean, median,
    or other statistic of the values within each bin.

    Parameters
    ----------
    sample : array_like
        Data to histogram passed as a sequence of D arrays of length N, or
        as an (N,D) array.
    values : array_like
        The values on which the statistic will be computed.  This must be
        the same shape as x.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean').
        The following statistics are available:

          * 'mean' : compute the mean of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'median' : compute the median of values for points within each
            bin. Empty bins will be represented by NaN.
          * 'count' : compute the count of points within each bin.  This is
            identical to an unweighted histogram.  `values` array is not
            referenced.
          * 'sum' : compute the sum of values for points within each bin.
            This is identical to a weighted histogram.
          * function : a user-defined function which takes a 1D array of
            values, and outputs a single numerical statistic. This function
            will be called on the values in each bin.  Empty bins will be
            represented by function([]), or NaN if this returns an error.

    bins : sequence or int, optional
        The bin specification:

          * A sequence of arrays describing the bin edges along each dimension.
          * The number of bins for each dimension (nx, ny, ... = bins)
          * The number of bins for all dimensions (nx=ny=...=bins).

    range : sequence, optional
        A sequence of lower and upper bin edges to be used if the edges are
        not given explicitly in `bins`. Defaults to the minimum and maximum
        values along each dimension.

    Returns
    -------
    statistic : ndarray, shape(nx1, nx2, nx3,...)
        The values of the selected statistic in each two-dimensional bin
    edges : list of ndarrays
        A list of D arrays describing the (nxi + 1) bin edges for each
        dimension
    binnumber : 1-D ndarray of ints
        This assigns to each observation an integer that represents the bin
        in which this observation falls. Array has the same length as values.

    See Also
    --------
    np.histogramdd, binned_statistic, binned_statistic_2d

    Notes
    -----

    .. versionadded:: 0.11.0

    """
    if type(statistic) == str:
        if statistic not in ['mean', 'median', 'count', 'sum', 'std']:
            raise ValueError('unrecognized statistic "%s"' % statistic)
    elif callable(statistic):
        pass
    else:
        raise ValueError("statistic not understood")

    # This code is based on np.histogramdd
    try:
        # Sample is an ND-array.
        N, D = sample.shape
    except (AttributeError, ValueError):
        # Sample is a sequence of 1D arrays.
        sample = np.atleast_2d(sample).T
        N, D = sample.shape

    nbin = np.empty(D, int)
    edges = D * [None]
    dedges = D * [None]

    try:
        M = len(bins)
        if M != D:
            raise AttributeError('The dimension of bins must be equal '
                                 'to the dimension of the sample x.')
    except TypeError:
        bins = D * [bins]

    # Select range for each dimension
    # Used only if number of bins is given.
    if range is None:
        smin = np.atleast_1d(np.array(sample.min(0), float))
        smax = np.atleast_1d(np.array(sample.max(0), float))
    else:
        smin = np.zeros(D)
        smax = np.zeros(D)
        for i in np.arange(D):
            smin[i], smax[i] = range[i]

    # Make sure the bins have a finite width.
    for i in np.arange(len(smin)):
        if smin[i] == smax[i]:
            smin[i] = smin[i] - .5
            smax[i] = smax[i] + .5

    # Create edge arrays
    for i in np.arange(D):
        if np.isscalar(bins[i]):
            nbin[i] = bins[i] + 2  # +2 for outlier bins
            edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1)
        else:
            edges[i] = np.asarray(bins[i], float)
            nbin[i] = len(edges[i]) + 1  # +1 for outlier bins
        dedges[i] = np.diff(edges[i])

    nbin = np.asarray(nbin)

    # Compute the bin number each sample falls into.
    Ncount = {}
    for i in np.arange(D):
        Ncount[i] = np.digitize(sample[:, i], edges[i])

    # Using digitize, values that fall on an edge are put in the right bin.
    # For the rightmost bin, we want values equal to the right
    # edge to be counted in the last bin, and not as an outlier.
    for i in np.arange(D):
        # Rounding precision
        decimal = int(-np.log10(dedges[i].min())) + 6
        # Find which points are on the rightmost edge.
        on_edge = np.where(np.around(sample[:, i], decimal)
                           == np.around(edges[i][-1], decimal))[0]
        # Shift these points one bin to the left.
        Ncount[i][on_edge] -= 1

    # Compute the sample indices in the flattened statistic matrix.
    ni = nbin.argsort()
    xy = np.zeros(N, int)
    for i in np.arange(0, D - 1):
        xy += Ncount[ni[i]] * nbin[ni[i + 1:]].prod()
    xy += Ncount[ni[-1]]

    result = np.empty(nbin.prod(), float)

    if statistic == 'mean':
        result.fill(np.nan)
        flatcount = np.bincount(xy, None)
        flatsum = np.bincount(xy, values)
        a = flatcount.nonzero()
        result[a] = flatsum[a] / flatcount[a]
    elif statistic == 'std':
        result.fill(0)
        flatcount = np.bincount(xy, None)
        flatsum = np.bincount(xy, values)
        flatsum2 = np.bincount(xy, values ** 2)
        a = flatcount.nonzero()
        result[a] = np.sqrt(flatsum2[a] / flatcount[a]
                            - (flatsum[a] / flatcount[a]) ** 2)
    elif statistic == 'count':
        result.fill(0)
        flatcount = np.bincount(xy, None)
        a = np.arange(len(flatcount))
        result[a] = flatcount
    elif statistic == 'sum':
        result.fill(0)
        flatsum = np.bincount(xy, values)
        a = np.arange(len(flatsum))
        result[a] = flatsum
    elif statistic == 'median':
        result.fill(np.nan)
        for i in np.unique(xy):
            result[i] = np.median(values[xy == i])
    elif callable(statistic):
        with warnings.catch_warnings():
            # Numpy generates a warnings for mean/std/... with empty list
            warnings.filterwarnings('ignore', category=RuntimeWarning)
            old = np.seterr(invalid='ignore')
            try:
                null = statistic([])
            except:
                null = np.nan
            np.seterr(**old)
        result.fill(null)
        for i in np.unique(xy):
            result[i] = statistic(values[xy == i])

    # Shape into a proper matrix
    result = result.reshape(np.sort(nbin))
    for i in np.arange(nbin.size):
        j = ni.argsort()[i]
        result = result.swapaxes(i, j)
        ni[i], ni[j] = ni[j], ni[i]

    # Remove outliers (indices 0 and -1 for each dimension).
    core = D * [slice(1, -1)]
    result = result[core]

    if (result.shape != nbin - 2).any():
        raise RuntimeError('Internal Shape Error')

    return result, edges, xy
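For orientation, here is a minimal usage sketch of the binned-statistic API deleted above. It assumes the scipy.stats equivalents this commit delegates to (scipy >= 0.16); the data and expected behaviour follow the docstring examples in the deleted file.

# Hedged usage sketch -- assumes scipy >= 0.16 provides equivalent functions.
import numpy as np
from scipy import stats

x = np.array([1, 2, 1, 2, 4])
values = np.arange(5)

# 1-D: mean of `values` within each of 3 equal-width bins over (x.min(), x.max()).
statistic, bin_edges, binnumber = stats.binned_statistic(
    x, values, statistic='mean', bins=3)

# 2-D: count the points on a 4x4 grid (equivalent to an unweighted 2-D histogram).
y = np.array([0.0, 0.5, 1.0, 1.5, 2.0])
counts, xedges, yedges, binnum = stats.binned_statistic_2d(
    x, y, values, statistic='count', bins=4)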
(Two file diffs suppressed because they are too large.)
@@ -1,884 +0,0 @@
#
# Author: Joris Vankerschaver 2013
#
from __future__ import division, print_function, absolute_import

import numpy as np
import scipy.linalg
from scipy.misc import doccer
from scipy.special import gammaln


__all__ = ['multivariate_normal', 'dirichlet']

_LOG_2PI = np.log(2 * np.pi)


def _process_parameters(dim, mean, cov):
    """
    Infer dimensionality from mean or covariance matrix, ensure that
    mean and covariance are full vector resp. matrix.

    """

    # Try to infer dimensionality
    if dim is None:
        if mean is None:
            if cov is None:
                dim = 1
            else:
                cov = np.asarray(cov, dtype=float)
                if cov.ndim < 2:
                    dim = 1
                else:
                    dim = cov.shape[0]
        else:
            mean = np.asarray(mean, dtype=float)
            dim = mean.size
    else:
        if not np.isscalar(dim):
            raise ValueError("Dimension of random variable must be a scalar.")

    # Check input sizes and return full arrays for mean and cov if necessary
    if mean is None:
        mean = np.zeros(dim)
    mean = np.asarray(mean, dtype=float)

    if cov is None:
        cov = 1.0
    cov = np.asarray(cov, dtype=float)

    if dim == 1:
        mean.shape = (1,)
        cov.shape = (1, 1)

    if mean.ndim != 1 or mean.shape[0] != dim:
        raise ValueError("Array 'mean' must be a vector of length %d." % dim)
    if cov.ndim == 0:
        cov = cov * np.eye(dim)
    elif cov.ndim == 1:
        cov = np.diag(cov)
    elif cov.ndim == 2 and cov.shape != (dim, dim):
        rows, cols = cov.shape
        if rows != cols:
            msg = ("Array 'cov' must be square if it is two dimensional,"
                   " but cov.shape = %s." % str(cov.shape))
        else:
            msg = ("Dimension mismatch: array 'cov' is of shape %s,"
                   " but 'mean' is a vector of length %d.")
            msg = msg % (str(cov.shape), len(mean))
        raise ValueError(msg)
    elif cov.ndim > 2:
        raise ValueError("Array 'cov' must be at most two-dimensional,"
                         " but cov.ndim = %d" % cov.ndim)

    return dim, mean, cov


def _process_quantiles(x, dim):
    """
    Adjust quantiles array so that last axis labels the components of
    each data point.

    """
    x = np.asarray(x, dtype=float)

    if x.ndim == 0:
        x = x[np.newaxis]
    elif x.ndim == 1:
        if dim == 1:
            x = x[:, np.newaxis]
        else:
            x = x[np.newaxis, :]

    return x


def _squeeze_output(out):
    """
    Remove single-dimensional entries from array and convert to scalar,
    if necessary.

    """
    out = out.squeeze()
    if out.ndim == 0:
        out = out[()]
    return out


def _eigvalsh_to_eps(spectrum, cond=None, rcond=None):
    """
    Determine which eigenvalues are "small" given the spectrum.

    This is for compatibility across various linear algebra functions
    that should agree about whether or not a Hermitian matrix is numerically
    singular and what is its numerical matrix rank.
    This is designed to be compatible with scipy.linalg.pinvh.

    Parameters
    ----------
    spectrum : 1d ndarray
        Array of eigenvalues of a Hermitian matrix.
    cond, rcond : float, optional
        Cutoff for small eigenvalues.
        Singular values smaller than rcond * largest_eigenvalue are
        considered zero.
        If None or -1, suitable machine precision is used.

    Returns
    -------
    eps : float
        Magnitude cutoff for numerical negligibility.

    """
    if rcond is not None:
        cond = rcond
    if cond in [None, -1]:
        t = spectrum.dtype.char.lower()
        factor = {'f': 1E3, 'd': 1E6}
        cond = factor[t] * np.finfo(t).eps
    eps = cond * np.max(abs(spectrum))
    return eps


def _pinv_1d(v, eps=1e-5):
    """
    A helper function for computing the pseudoinverse.

    Parameters
    ----------
    v : iterable of numbers
        This may be thought of as a vector of eigenvalues or singular values.
    eps : float
        Values with magnitude no greater than eps are considered negligible.

    Returns
    -------
    v_pinv : 1d float ndarray
        A vector of pseudo-inverted numbers.

    """
    return np.array([0 if abs(x) <= eps else 1/x for x in v], dtype=float)


class _PSD(object):
    """
    Compute coordinated functions of a symmetric positive semidefinite matrix.

    This class addresses two issues.  Firstly it allows the pseudoinverse,
    the logarithm of the pseudo-determinant, and the rank of the matrix
    to be computed using one call to eigh instead of three.
    Secondly it allows these functions to be computed in a way
    that gives mutually compatible results.
    All of the functions are computed with a common understanding as to
    which of the eigenvalues are to be considered negligibly small.
    The functions are designed to coordinate with scipy.linalg.pinvh()
    but not necessarily with np.linalg.det() or with np.linalg.matrix_rank().

    Parameters
    ----------
    M : 2d array-like
        Symmetric positive semidefinite matrix.
    cond, rcond : float, optional
        Cutoff for small eigenvalues.
        Singular values smaller than rcond * largest_eigenvalue are
        considered zero.
        If None or -1, suitable machine precision is used.
    lower : bool, optional
        Whether the pertinent array data is taken from the lower
        or upper triangle of M. (Default: lower)
    check_finite : bool, optional
        Whether to check that the input matrices contain only finite
        numbers. Disabling may give a performance gain, but may result
        in problems (crashes, non-termination) if the inputs do contain
        infinities or NaNs.
    allow_singular : bool, optional
        Whether to allow a singular matrix.  (Default: True)

    Notes
    -----
    The arguments are similar to those of scipy.linalg.pinvh().

    """

    def __init__(self, M, cond=None, rcond=None, lower=True,
                 check_finite=True, allow_singular=True):
        # Compute the symmetric eigendecomposition.
        # Note that eigh takes care of array conversion, chkfinite,
        # and assertion that the matrix is square.
        s, u = scipy.linalg.eigh(M, lower=lower, check_finite=check_finite)

        eps = _eigvalsh_to_eps(s, cond, rcond)
        if np.min(s) < -eps:
            raise ValueError('the input matrix must be positive semidefinite')
        d = s[s > eps]
        if len(d) < len(s) and not allow_singular:
            raise np.linalg.LinAlgError('singular matrix')
        s_pinv = _pinv_1d(s, eps)
        U = np.multiply(u, np.sqrt(s_pinv))

        # Initialize the eagerly precomputed attributes.
        self.rank = len(d)
        self.U = U
        self.log_pdet = np.sum(np.log(d))

        # Initialize an attribute to be lazily computed.
        self._pinv = None

    @property
    def pinv(self):
        if self._pinv is None:
            self._pinv = np.dot(self.U, self.U.T)
        return self._pinv


_doc_default_callparams = """\
mean : array_like, optional
    Mean of the distribution (default zero)
cov : array_like, optional
    Covariance matrix of the distribution (default one)
allow_singular : bool, optional
    Whether to allow a singular covariance matrix.  (Default: False)
"""

_doc_callparams_note = \
    """Setting the parameter `mean` to `None` is equivalent to having `mean`
    be the zero-vector. The parameter `cov` can be a scalar, in which case
    the covariance matrix is the identity times that value, a vector of
    diagonal entries for the covariance matrix, or a two-dimensional
    array_like.
    """

_doc_frozen_callparams = ""

_doc_frozen_callparams_note = \
    """See class definition for a detailed description of parameters."""

docdict_params = {
    '_doc_default_callparams': _doc_default_callparams,
    '_doc_callparams_note': _doc_callparams_note
}

docdict_noparams = {
    '_doc_default_callparams': _doc_frozen_callparams,
    '_doc_callparams_note': _doc_frozen_callparams_note
}


class multivariate_normal_gen(object):
    r"""
    A multivariate normal random variable.

    The `mean` keyword specifies the mean. The `cov` keyword specifies the
    covariance matrix.

    Methods
    -------
    pdf(x, mean=None, cov=1, allow_singular=False)
        Probability density function.
    logpdf(x, mean=None, cov=1, allow_singular=False)
        Log of the probability density function.
    rvs(mean=None, cov=1, allow_singular=False, size=1)
        Draw random samples from a multivariate normal distribution.
    entropy()
        Compute the differential entropy of the multivariate normal.

    Parameters
    ----------
    x : array_like
        Quantiles, with the last axis of `x` denoting the components.
    %(_doc_default_callparams)s

    Alternatively, the object may be called (as a function) to fix the mean
    and covariance parameters, returning a "frozen" multivariate normal
    random variable:

    rv = multivariate_normal(mean=None, cov=1, allow_singular=False)
        - Frozen object with the same methods but holding the given
          mean and covariance fixed.

    Notes
    -----
    %(_doc_callparams_note)s

    The covariance matrix `cov` must be a (symmetric) positive
    semi-definite matrix. The determinant and inverse of `cov` are computed
    as the pseudo-determinant and pseudo-inverse, respectively, so
    that `cov` does not need to have full rank.

    The probability density function for `multivariate_normal` is

    .. math::

        f(x) = \frac{1}{\sqrt{(2 \pi)^k \det \Sigma}} \exp\left( -\frac{1}{2} (x - \mu)^T \Sigma^{-1} (x - \mu) \right),

    where :math:`\mu` is the mean, :math:`\Sigma` the covariance matrix,
    and :math:`k` is the dimension of the space where :math:`x` takes values.

    .. versionadded:: 0.14.0

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> from scipy.stats import multivariate_normal
    >>> x = np.linspace(0, 5, 10, endpoint=False)
    >>> y = multivariate_normal.pdf(x, mean=2.5, cov=0.5); y
    array([ 0.00108914,  0.01033349,  0.05946514,  0.20755375,  0.43939129,
            0.56418958,  0.43939129,  0.20755375,  0.05946514,  0.01033349])
    >>> plt.plot(x, y)

    The input quantiles can be any shape of array, as long as the last
    axis labels the components.  This allows us for instance to
    display the frozen pdf for a non-isotropic random variable in 2D as
    follows:

    >>> x, y = np.mgrid[-1:1:.01, -1:1:.01]
    >>> pos = np.empty(x.shape + (2,))
    >>> pos[:, :, 0] = x; pos[:, :, 1] = y
    >>> rv = multivariate_normal([0.5, -0.2], [[2.0, 0.3], [0.3, 0.5]])
    >>> plt.contourf(x, y, rv.pdf(pos))

    """

    def __init__(self):
        self.__doc__ = doccer.docformat(self.__doc__, docdict_params)

    def __call__(self, mean=None, cov=1, allow_singular=False):
        """
        Create a frozen multivariate normal distribution.

        See `multivariate_normal_frozen` for more information.

        """
        return multivariate_normal_frozen(mean, cov,
                                          allow_singular=allow_singular)

    def _logpdf(self, x, mean, prec_U, log_det_cov, rank):
        """
        Parameters
        ----------
        x : ndarray
            Points at which to evaluate the log of the probability
            density function
        mean : ndarray
            Mean of the distribution
        prec_U : ndarray
            A decomposition such that np.dot(prec_U, prec_U.T)
            is the precision matrix, i.e. inverse of the covariance matrix.
        log_det_cov : float
            Logarithm of the determinant of the covariance matrix
        rank : int
            Rank of the covariance matrix.

        Notes
        -----
        As this function does no argument checking, it should not be
        called directly; use 'logpdf' instead.

        """
        dev = x - mean
        maha = np.sum(np.square(np.dot(dev, prec_U)), axis=-1)
        return -0.5 * (rank * _LOG_2PI + log_det_cov + maha)

    def logpdf(self, x, mean, cov, allow_singular=False):
        """
        Log of the multivariate normal probability density function.

        Parameters
        ----------
        x : array_like
            Quantiles, with the last axis of `x` denoting the components.
        %(_doc_default_callparams)s

        Notes
        -----
        %(_doc_callparams_note)s

        Returns
        -------
        pdf : ndarray
            Log of the probability density function evaluated at `x`

        """
        dim, mean, cov = _process_parameters(None, mean, cov)
        x = _process_quantiles(x, dim)
        psd = _PSD(cov, allow_singular=allow_singular)
        out = self._logpdf(x, mean, psd.U, psd.log_pdet, psd.rank)
        return _squeeze_output(out)

    def pdf(self, x, mean, cov, allow_singular=False):
        """
        Multivariate normal probability density function.

        Parameters
        ----------
        x : array_like
            Quantiles, with the last axis of `x` denoting the components.
        %(_doc_default_callparams)s

        Notes
        -----
        %(_doc_callparams_note)s

        Returns
        -------
        pdf : ndarray
            Probability density function evaluated at `x`

        """
        dim, mean, cov = _process_parameters(None, mean, cov)
        x = _process_quantiles(x, dim)
        psd = _PSD(cov, allow_singular=allow_singular)
        out = np.exp(self._logpdf(x, mean, psd.U, psd.log_pdet, psd.rank))
        return _squeeze_output(out)

    def rvs(self, mean=None, cov=1, size=1):
        """
        Draw random samples from a multivariate normal distribution.

        Parameters
        ----------
        %(_doc_default_callparams)s
        size : integer, optional
            Number of samples to draw (default 1).

        Notes
        -----
        %(_doc_callparams_note)s

        Returns
        -------
        rvs : ndarray or scalar
            Random variates of size (`size`, `N`), where `N` is the
            dimension of the random variable.

        """
        dim, mean, cov = _process_parameters(None, mean, cov)
        out = np.random.multivariate_normal(mean, cov, size)
        return _squeeze_output(out)

    def entropy(self, mean=None, cov=1):
        """
        Compute the differential entropy of the multivariate normal.

        Parameters
        ----------
        %(_doc_default_callparams)s

        Notes
        -----
        %(_doc_callparams_note)s

        Returns
        -------
        h : scalar
            Entropy of the multivariate normal distribution

        """
        dim, mean, cov = _process_parameters(None, mean, cov)
        return 0.5 * np.log(np.linalg.det(2 * np.pi * np.e * cov))


multivariate_normal = multivariate_normal_gen()


class multivariate_normal_frozen(object):
    def __init__(self, mean=None, cov=1, allow_singular=False):
        """
        Create a frozen multivariate normal distribution.

        Parameters
        ----------
        mean : array_like, optional
            Mean of the distribution (default zero)
        cov : array_like, optional
            Covariance matrix of the distribution (default one)
        allow_singular : bool, optional
            If this flag is True then tolerate a singular
            covariance matrix (default False).

        Examples
        --------
        When called with the default parameters, this will create a 1D random
        variable with mean 0 and covariance 1:

        >>> from scipy.stats import multivariate_normal
        >>> r = multivariate_normal()
        >>> r.mean
        array([ 0.])
        >>> r.cov
        array([[1.]])

        """
        self.dim, self.mean, self.cov = _process_parameters(None, mean, cov)
        self.cov_info = _PSD(self.cov, allow_singular=allow_singular)
        self._mnorm = multivariate_normal_gen()

    def logpdf(self, x):
        x = _process_quantiles(x, self.dim)
        out = self._mnorm._logpdf(x, self.mean, self.cov_info.U,
                                  self.cov_info.log_pdet, self.cov_info.rank)
        return _squeeze_output(out)

    def pdf(self, x):
        return np.exp(self.logpdf(x))

    def rvs(self, size=1):
        return self._mnorm.rvs(self.mean, self.cov, size)

    def entropy(self):
        """
        Computes the differential entropy of the multivariate normal.

        Returns
        -------
        h : scalar
            Entropy of the multivariate normal distribution

        """
        log_pdet = self.cov_info.log_pdet
        rank = self.cov_info.rank
        return 0.5 * (rank * (_LOG_2PI + 1) + log_pdet)


# Set frozen generator docstrings from corresponding docstrings in
# multivariate_normal_gen and fill in default strings in class docstrings
for name in ['logpdf', 'pdf', 'rvs']:
    method = multivariate_normal_gen.__dict__[name]
    method_frozen = multivariate_normal_frozen.__dict__[name]
    method_frozen.__doc__ = doccer.docformat(method.__doc__, docdict_noparams)
    method.__doc__ = doccer.docformat(method.__doc__, docdict_params)

_dirichlet_doc_default_callparams = """\
alpha : array_like
    The concentration parameters. The number of entries determines the
    dimensionality of the distribution.
"""
_dirichlet_doc_frozen_callparams = ""

_dirichlet_doc_frozen_callparams_note = \
    """See class definition for a detailed description of parameters."""

dirichlet_docdict_params = {
    '_dirichlet_doc_default_callparams': _dirichlet_doc_default_callparams,
}

dirichlet_docdict_noparams = {
    '_dirichlet_doc_default_callparams': _dirichlet_doc_frozen_callparams,
}


def _dirichlet_check_parameters(alpha):
    alpha = np.asarray(alpha)
    if np.min(alpha) <= 0:
        raise ValueError("All parameters must be greater than 0")
    elif alpha.ndim != 1:
        raise ValueError("Parameter vector 'a' must be one dimensional, " +
                         "but a.shape = %s." % str(alpha.shape))
    return alpha


def _dirichlet_check_input(alpha, x):
    x = np.asarray(x)

    if x.shape[0] + 1 != alpha.shape[0] and x.shape[0] != alpha.shape[0]:
        raise ValueError("Vector 'x' must have one entry less than the" +
                         " parameter vector 'a', but alpha.shape = " +
                         "%s and " % alpha.shape +
                         "x.shape = %s." % x.shape)

    if x.shape[0] != alpha.shape[0]:
        xk = np.array([1 - np.sum(x, 0)])
        if xk.ndim == 1:
            x = np.append(x, xk)
        elif xk.ndim == 2:
            x = np.vstack((x, xk))
        else:
            raise ValueError("The input must be one dimensional or a two "
                             "dimensional matrix containing the entries.")

    if np.min(x) < 0:
        raise ValueError("Each entry in 'x' must be greater than or equal to zero.")

    if np.max(x) > 1:
        raise ValueError("Each entry in 'x' must be smaller than or equal to one.")

    if (np.abs(np.sum(x, 0) - 1.0) > 10e-10).any():
        raise ValueError("The input vector 'x' must lie within the normal " +
                         "simplex, but sum(x) = %f." % np.sum(x, 0))

    return x


def _lnB(alpha):
    r"""
    Internal helper function to compute the log of the useful quotient

    .. math::
        B(\alpha) = \frac{\prod_{i=1}^{K}\Gamma(\alpha_i)}{\Gamma\left(\sum_{i=1}^{K}\alpha_i\right)}

    Parameters
    ----------
    %(_dirichlet_doc_default_callparams)s

    Returns
    -------
    B : scalar
        Helper quotient, internal use only

    """
    return np.sum(gammaln(alpha)) - gammaln(np.sum(alpha))


class dirichlet_gen(object):
    r"""
    A Dirichlet random variable.

    The `alpha` keyword specifies the concentration parameters of the
    distribution.

    .. versionadded:: 0.15.0

    Methods
    -------
    pdf(x, alpha)
        Probability density function.
    logpdf(x, alpha)
        Log of the probability density function.
    rvs(alpha, size=1)
        Draw random samples from a Dirichlet distribution.
    mean(alpha)
        The mean of the Dirichlet distribution
    var(alpha)
        The variance of the Dirichlet distribution
    entropy(alpha)
        Compute the differential entropy of the Dirichlet distribution.

    Parameters
    ----------
    x : array_like
        Quantiles, with the last axis of `x` denoting the components.
    %(_dirichlet_doc_default_callparams)s

    Alternatively, the object may be called (as a function) to fix
    concentration parameters, returning a "frozen" Dirichlet
    random variable:

    rv = dirichlet(alpha)
        - Frozen object with the same methods but holding the given
          concentration parameters fixed.

    Notes
    -----
    Each :math:`\alpha` entry must be positive. The distribution has support
    only on the simplex defined by

    .. math::
        \sum_{i=1}^{K} x_i \le 1

    The probability density function for `dirichlet` is

    .. math::

        f(x) = \frac{1}{\mathrm{B}(\boldsymbol\alpha)} \prod_{i=1}^K x_i^{\alpha_i - 1}

    where

    .. math::
        \mathrm{B}(\boldsymbol\alpha) = \frac{\prod_{i=1}^K \Gamma(\alpha_i)}{\Gamma\bigl(\sum_{i=1}^K \alpha_i\bigr)}

    and :math:`\boldsymbol\alpha=(\alpha_1,\ldots,\alpha_K)` are the
    concentration parameters and :math:`K` is the dimension of the space
    where :math:`x` takes values.

    """

    def __init__(self):
        self.__doc__ = doccer.docformat(self.__doc__, dirichlet_docdict_params)

    def __call__(self, alpha):
        return dirichlet_frozen(alpha)

    def _logpdf(self, x, alpha):
        """
        Parameters
        ----------
        x : ndarray
            Points at which to evaluate the log of the probability
            density function
        %(_dirichlet_doc_default_callparams)s

        Notes
        -----
        As this function does no argument checking, it should not be
        called directly; use 'logpdf' instead.

        """
        lnB = _lnB(alpha)
        return - lnB + np.sum((np.log(x.T) * (alpha - 1)).T, 0)

    def logpdf(self, x, alpha):
        """
        Log of the Dirichlet probability density function.

        Parameters
        ----------
        x : array_like
            Quantiles, with the last axis of `x` denoting the components.
        %(_dirichlet_doc_default_callparams)s

        Returns
        -------
        pdf : ndarray
            Log of the probability density function evaluated at `x`
        """
        alpha = _dirichlet_check_parameters(alpha)
        x = _dirichlet_check_input(alpha, x)

        out = self._logpdf(x, alpha)
        return _squeeze_output(out)

    def pdf(self, x, alpha):
        """
        The Dirichlet probability density function.

        Parameters
        ----------
        x : array_like
            Quantiles, with the last axis of `x` denoting the components.
        %(_dirichlet_doc_default_callparams)s

        Returns
        -------
        pdf : ndarray
            The probability density function evaluated at `x`
        """
        alpha = _dirichlet_check_parameters(alpha)
        x = _dirichlet_check_input(alpha, x)

        out = np.exp(self._logpdf(x, alpha))
        return _squeeze_output(out)

    def mean(self, alpha):
        """
        Compute the mean of the Dirichlet distribution.

        Parameters
        ----------
        %(_dirichlet_doc_default_callparams)s

        Returns
        -------
        mu : scalar
            Mean of the Dirichlet distribution

        """
        alpha = _dirichlet_check_parameters(alpha)

        out = alpha / (np.sum(alpha))
        return _squeeze_output(out)

    def var(self, alpha):
        """
        Compute the variance of the Dirichlet distribution.

        Parameters
        ----------
        %(_dirichlet_doc_default_callparams)s

        Returns
        -------
        v : scalar
            Variance of the Dirichlet distribution

        """

        alpha = _dirichlet_check_parameters(alpha)

        alpha0 = np.sum(alpha)
        out = (alpha * (alpha0 - alpha)) / ((alpha0 * alpha0) * (alpha0 + 1))
        return out

    def entropy(self, alpha):
        """
        Compute the differential entropy of the Dirichlet distribution.

        Parameters
        ----------
        %(_dirichlet_doc_default_callparams)s

        Returns
        -------
        h : scalar
            Entropy of the Dirichlet distribution

        """

        alpha = _dirichlet_check_parameters(alpha)

        alpha0 = np.sum(alpha)
        lnB = _lnB(alpha)
        K = alpha.shape[0]

        out = lnB + (alpha0 - K) * scipy.special.psi(alpha0) - np.sum(
            (alpha - 1) * scipy.special.psi(alpha))
        return _squeeze_output(out)

    def rvs(self, alpha, size=1):
        """
        Draw random samples from a Dirichlet distribution.

        Parameters
        ----------
        %(_dirichlet_doc_default_callparams)s
        size : integer, optional
            Number of samples to draw (default 1).

        Returns
        -------
        rvs : ndarray or scalar
            Random variates of size (`size`, `N`), where `N` is the
            dimension of the random variable.

        """
        alpha = _dirichlet_check_parameters(alpha)
        return np.random.dirichlet(alpha, size=size)


dirichlet = dirichlet_gen()


class dirichlet_frozen(object):
    def __init__(self, alpha):
        self.alpha = _dirichlet_check_parameters(alpha)
        self._dirichlet = dirichlet_gen()

    def logpdf(self, x):
        return self._dirichlet.logpdf(x, self.alpha)

    def pdf(self, x):
        return self._dirichlet.pdf(x, self.alpha)

    def mean(self):
        return self._dirichlet.mean(self.alpha)

    def var(self):
        return self._dirichlet.var(self.alpha)

    def entropy(self):
        return self._dirichlet.entropy(self.alpha)

    def rvs(self, size=1):
        return self._dirichlet.rvs(self.alpha, size)


# Set frozen generator docstrings from corresponding docstrings in
# dirichlet_gen and fill in default strings in class docstrings
for name in ['logpdf', 'pdf', 'rvs', 'mean', 'var', 'entropy']:
    method = dirichlet_gen.__dict__[name]
    method_frozen = dirichlet_frozen.__dict__[name]
    method_frozen.__doc__ = doccer.docformat(
        method.__doc__, dirichlet_docdict_noparams)
    method.__doc__ = doccer.docformat(method.__doc__, dirichlet_docdict_params)
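As a usage note for the deleted module above, here is a short sketch of the frozen-distribution pattern it implements, written against the scipy.stats equivalents (multivariate_normal since scipy 0.14, dirichlet since 0.15); the parameter values echo the docstring examples.

# Hedged sketch of the frozen-distribution pattern, using scipy.stats.
import numpy as np
from scipy.stats import multivariate_normal, dirichlet

# Frozen multivariate normal, mirroring the 2-D example in the docstring above.
rv = multivariate_normal(mean=[0.5, -0.2], cov=[[2.0, 0.3], [0.3, 0.5]])
print(rv.pdf([0.0, 0.0]))   # density at a single point
print(rv.rvs(size=3))       # three random draws, shape (3, 2)

# Frozen Dirichlet with concentration parameters alpha.
d = dirichlet(alpha=[1.0, 2.0, 3.0])
print(d.mean())                  # alpha / alpha.sum()
x = np.array([0.2, 0.3, 0.5])    # a point on the simplex (entries sum to 1)
print(d.pdf(x))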
@ -1,201 +0,0 @@
|
|||||||
from __future__ import division, print_function, absolute_import
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from numpy import poly1d
|
|
||||||
from scipy.special import beta
|
|
||||||
|
|
||||||
|
|
||||||
# The following code was used to generate the Pade coefficients for the
|
|
||||||
# Tukey Lambda variance function. Version 0.17 of mpmath was used.
|
|
||||||
#---------------------------------------------------------------------------
|
|
||||||
# import mpmath as mp
|
|
||||||
#
|
|
||||||
# mp.mp.dps = 60
|
|
||||||
#
|
|
||||||
# one = mp.mpf(1)
|
|
||||||
# two = mp.mpf(2)
|
|
||||||
#
|
|
||||||
# def mpvar(lam):
|
|
||||||
# if lam == 0:
|
|
||||||
# v = mp.pi**2 / three
|
|
||||||
# else:
|
|
||||||
# v = (two / lam**2) * (one / (one + two*lam) -
|
|
||||||
# mp.beta(lam + one, lam + one))
|
|
||||||
# return v
|
|
||||||
#
|
|
||||||
# t = mp.taylor(mpvar, 0, 8)
|
|
||||||
# p, q = mp.pade(t, 4, 4)
|
|
||||||
# print "p =", [mp.fp.mpf(c) for c in p]
|
|
||||||
# print "q =", [mp.fp.mpf(c) for c in q]
|
|
||||||
#---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# Pade coefficients for the Tukey Lambda variance function.
|
|
||||||
_tukeylambda_var_pc = [3.289868133696453, 0.7306125098871127,
|
|
||||||
-0.5370742306855439, 0.17292046290190008,
|
|
||||||
-0.02371146284628187]
|
|
||||||
_tukeylambda_var_qc = [1.0, 3.683605511659861, 4.184152498888124,
|
|
||||||
1.7660926747377275, 0.2643989311168465]
|
|
||||||
|
|
||||||
# numpy.poly1d instances for the numerator and denominator of the
|
|
||||||
# Pade approximation to the Tukey Lambda variance.
|
|
||||||
_tukeylambda_var_p = poly1d(_tukeylambda_var_pc[::-1])
|
|
||||||
_tukeylambda_var_q = poly1d(_tukeylambda_var_qc[::-1])
|
|
||||||
|
|
||||||
|
|
||||||
def tukeylambda_variance(lam):
|
|
||||||
"""Variance of the Tukey Lambda distribution.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
lam : array_like
|
|
||||||
The lambda values at which to compute the variance.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
v : ndarray
|
|
||||||
The variance. For lam < -0.5, the variance is not defined, so
|
|
||||||
np.nan is returned. For lam = 0.5, np.inf is returned.
|
|
||||||
|
|
||||||
Notes
|
|
||||||
-----
|
|
||||||
In an interval around lambda=0, this function uses the [4,4] Pade
|
|
||||||
approximation to compute the variance. Otherwise it uses the standard
|
|
||||||
formula (http://en.wikipedia.org/wiki/Tukey_lambda_distribution). The
|
|
||||||
Pade approximation is used because the standard formula has a removable
|
|
||||||
discontinuity at lambda = 0, and does not produce accurate numerical
|
|
||||||
results near lambda = 0.
|
|
||||||
"""
|
|
||||||
lam = np.asarray(lam)
|
|
||||||
shp = lam.shape
|
|
||||||
lam = np.atleast_1d(lam).astype(np.float64)
|
|
||||||
|
|
||||||
# For absolute values of lam less than threshold, use the Pade
|
|
||||||
# approximation.
|
|
||||||
threshold = 0.075
|
|
||||||
|
|
||||||
# Play games with masks to implement the conditional evaluation of
|
|
||||||
# the distribution.
|
|
||||||
# lambda < -0.5: var = nan
|
|
||||||
low_mask = lam < -0.5
|
|
||||||
# lambda == -0.5: var = inf
|
|
||||||
neghalf_mask = lam == -0.5
|
|
||||||
# abs(lambda) < threshold: use Pade approximation
|
|
||||||
small_mask = np.abs(lam) < threshold
|
|
||||||
# else the "regular" case: use the explicit formula.
|
|
||||||
reg_mask = ~(low_mask | neghalf_mask | small_mask)
|
|
||||||
|
|
||||||
# Get the 'lam' values for the cases where they are needed.
|
|
||||||
small = lam[small_mask]
|
|
||||||
reg = lam[reg_mask]
|
|
||||||
|
|
||||||
# Compute the function for each case.
|
|
||||||
v = np.empty_like(lam)
|
|
||||||
v[low_mask] = np.nan
|
|
||||||
v[neghalf_mask] = np.inf
|
|
||||||
if small.size > 0:
|
|
||||||
# Use the Pade approximation near lambda = 0.
|
|
||||||
v[small_mask] = _tukeylambda_var_p(small) / _tukeylambda_var_q(small)
|
|
||||||
if reg.size > 0:
|
|
||||||
v[reg_mask] = (2.0 / reg**2) * (1.0 / (1.0 + 2 * reg) -
|
|
||||||
beta(reg + 1, reg + 1))
|
|
||||||
v.shape = shp
|
|
||||||
return v
|
|
||||||
|
|
||||||
|
|
||||||
# The following code was used to generate the Pade coefficients for the
|
|
||||||
# Tukey Lambda kurtosis function. Version 0.17 of mpmath was used.
|
|
||||||
#---------------------------------------------------------------------------
|
|
||||||
# import mpmath as mp
|
|
||||||
#
|
|
||||||
# mp.mp.dps = 60
|
|
||||||
#
|
|
||||||
# one = mp.mpf(1)
|
|
||||||
# two = mp.mpf(2)
|
|
||||||
# three = mp.mpf(3)
|
|
||||||
# four = mp.mpf(4)
|
|
||||||
#
|
|
||||||
# def mpkurt(lam):
|
|
||||||
# if lam == 0:
|
|
||||||
# k = mp.mpf(6)/5
|
|
||||||
# else:
|
|
||||||
# numer = (one/(four*lam+one) - four*mp.beta(three*lam+one, lam+one) +
|
|
||||||
# three*mp.beta(two*lam+one, two*lam+one))
|
|
||||||
# denom = two*(one/(two*lam+one) - mp.beta(lam+one,lam+one))**2
|
|
||||||
# k = numer / denom - three
|
|
||||||
# return k
|
|
||||||
#
|
|
||||||
# # There is a bug in mpmath 0.17: when we use the 'method' keyword of the
|
|
||||||
# # taylor function and we request a degree 9 Taylor polynomial, we actually
|
|
||||||
# # get degree 8.
|
|
||||||
# t = mp.taylor(mpkurt, 0, 9, method='quad', radius=0.01)
|
|
||||||
# t = [mp.chop(c, tol=1e-15) for c in t]
|
|
||||||
# p, q = mp.pade(t, 4, 4)
|
|
||||||
# print "p =", [mp.fp.mpf(c) for c in p]
|
|
||||||
# print "q =", [mp.fp.mpf(c) for c in q]
|
|
||||||
#---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# Pade coefficients for the Tukey Lambda kurtosis function.
|
|
||||||
_tukeylambda_kurt_pc = [1.2, -5.853465139719495, -22.653447381131077,
|
|
||||||
0.20601184383406815, 4.59796302262789]
|
|
||||||
_tukeylambda_kurt_qc = [1.0, 7.171149192233599, 12.96663094361842,
|
|
||||||
0.43075235247853005, -2.789746758009912]
|
|
||||||
|
|
||||||
# numpy.poly1d instances for the numerator and denominator of the
|
|
||||||
# Pade approximation to the Tukey Lambda kurtosis.
|
|
||||||
_tukeylambda_kurt_p = poly1d(_tukeylambda_kurt_pc[::-1])
|
|
||||||
_tukeylambda_kurt_q = poly1d(_tukeylambda_kurt_qc[::-1])
|
|
||||||
|
|
||||||
|
|
||||||
def tukeylambda_kurtosis(lam):
    """Kurtosis of the Tukey Lambda distribution.

    Parameters
    ----------
    lam : array_like
        The lambda values at which to compute the kurtosis.

    Returns
    -------
    k : ndarray
        The kurtosis.  For lam < -0.25, the kurtosis is not defined, so
        np.nan is returned.  For lam = -0.25, np.inf is returned.

    """
    lam = np.asarray(lam)
    shp = lam.shape
    lam = np.atleast_1d(lam).astype(np.float64)

    # For absolute values of lam less than threshold, use the Pade
    # approximation.
    threshold = 0.055

    # Use masks to implement the conditional evaluation of the kurtosis.
    # lambda < -0.25:  kurtosis = nan
    low_mask = lam < -0.25
    # lambda == -0.25: kurtosis = inf
    negqrtr_mask = lam == -0.25
    # lambda near 0:  use Pade approximation
    small_mask = np.abs(lam) < threshold
    # else the "regular" case:  use the explicit formula.
    reg_mask = ~(low_mask | negqrtr_mask | small_mask)

    # Get the 'lam' values for the cases where they are needed.
    small = lam[small_mask]
    reg = lam[reg_mask]

    # Compute the function for each case.
    k = np.empty_like(lam)
    k[low_mask] = np.nan
    k[negqrtr_mask] = np.inf
    if small.size > 0:
        k[small_mask] = _tukeylambda_kurt_p(small) / _tukeylambda_kurt_q(small)
    if reg.size > 0:
        numer = (1.0 / (4 * reg + 1) - 4 * beta(3 * reg + 1, reg + 1) +
                 3 * beta(2 * reg + 1, 2 * reg + 1))
        denom = 2 * (1.0 / (2 * reg + 1) - beta(reg + 1, reg + 1)) ** 2
        k[reg_mask] = numer / denom - 3

    # The return value will be a numpy array; resetting the shape ensures that
    # if `lam` was a scalar, the return value is a 0-d array.
    k.shape = shp
    return k

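# A minimal usage sketch (editorial, not part of the original file): two
# values of the excess kurtosis that can be checked by hand.  lam = 0 is the
# logistic case (Pade branch) and lam = 2 is the uniform case handled by the
# explicit formula.
#
#     >>> tukeylambda_kurtosis(0.0)    # 6/5, logistic excess kurtosis
#     array(1.2)
#     >>> tukeylambda_kurtosis(2.0)    # -6/5, uniform excess kurtosis
#     array(-1.2)
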
@ -1,271 +0,0 @@
"""Some functions for working with contingency tables (i.e. cross
tabulations).
"""


from __future__ import division, print_function, absolute_import

from functools import reduce
import numpy as np
from .stats import power_divergence


__all__ = ['margins', 'expected_freq', 'chi2_contingency']


def margins(a):
    """Return a list of the marginal sums of the array `a`.

    Parameters
    ----------
    a : ndarray
        The array for which to compute the marginal sums.

    Returns
    -------
    margsums : list of ndarrays
        A list of length `a.ndim`.  `margsums[k]` is the result
        of summing `a` over all axes except `k`; it has the same
        number of dimensions as `a`, but the length of each axis
        except axis `k` will be 1.

    Examples
    --------
    >>> a = np.arange(12).reshape(2, 6)
    >>> a
    array([[ 0,  1,  2,  3,  4,  5],
           [ 6,  7,  8,  9, 10, 11]])
    >>> m0, m1 = margins(a)
    >>> m0
    array([[15],
           [51]])
    >>> m1
    array([[ 6,  8, 10, 12, 14, 16]])

    >>> b = np.arange(24).reshape(2,3,4)
    >>> m0, m1, m2 = margins(b)
    >>> m0
    array([[[ 66]],
           [[210]]])
    >>> m1
    array([[[ 60],
            [ 92],
            [124]]])
    >>> m2
    array([[[60, 66, 72, 78]]])

    """
    margsums = []
    ranged = list(range(a.ndim))
    for k in ranged:
        marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
        margsums.append(marg)
    return margsums

def expected_freq(observed):
    """
    Compute the expected frequencies from a contingency table.

    Given an n-dimensional contingency table of observed frequencies,
    compute the expected frequencies for the table based on the marginal
    sums under the assumption that the groups associated with each
    dimension are independent.

    Parameters
    ----------
    observed : array_like
        The table of observed frequencies.  (While this function can handle
        a 1-D array, that case is trivial.  Generally `observed` is at
        least 2-D.)

    Returns
    -------
    expected : ndarray of float64
        The expected frequencies, based on the marginal sums of the table.
        Same shape as `observed`.

    Examples
    --------
    >>> observed = np.array([[10, 10, 20], [20, 20, 20]])
    >>> expected_freq(observed)
    array([[ 12.,  12.,  16.],
           [ 18.,  18.,  24.]])

    """
    # Typically `observed` is an integer array. If `observed` has a large
    # number of dimensions or holds large values, some of the following
    # computations may overflow, so we first switch to floating point.
    observed = np.asarray(observed, dtype=np.float64)

    # Create a list of the marginal sums.
    margsums = margins(observed)

    # Create the array of expected frequencies.  The shapes of the
    # marginal sums returned by apply_over_axes() are just what we
    # need for broadcasting in the following product.
    d = observed.ndim
    expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
    return expected

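# Editorial worked check of the formula above, using the docstring's table:
# for [[10, 10, 20], [20, 20, 20]] the row sums are (40, 60), the column
# sums are (30, 30, 40) and the grand total is 100, so, e.g.,
# expected[0, 0] = 40 * 30 / 100 = 12.0.  The reduce(np.multiply, margsums)
# call builds exactly this outer product of the margins, and dividing by
# total**(ndim - 1) restores the scale.
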
def chi2_contingency(observed, correction=True, lambda_=None):
    """Chi-square test of independence of variables in a contingency table.

    This function computes the chi-square statistic and p-value for the
    hypothesis test of independence of the observed frequencies in the
    contingency table [1]_ `observed`.  The expected frequencies are computed
    based on the marginal sums under the assumption of independence; see
    `scipy.stats.contingency.expected_freq`.  The number of degrees of
    freedom is (expressed using numpy functions and attributes)::

        dof = observed.size - sum(observed.shape) + observed.ndim - 1


    Parameters
    ----------
    observed : array_like
        The contingency table.  The table contains the observed frequencies
        (i.e. number of occurrences) in each category.  In the
        two-dimensional case, the table is often described as an
        "R x C table".
    correction : bool, optional
        If True, *and* the degrees of freedom is 1, apply Yates' correction
        for continuity.  The effect of the correction is to adjust each
        observed value by 0.5 towards the corresponding expected value.
    lambda_ : float or str, optional
        By default, the statistic computed in this test is Pearson's
        chi-squared statistic [2]_.  `lambda_` allows a statistic from the
        Cressie-Read power divergence family [3]_ to be used instead.  See
        `power_divergence` for details.

    Returns
    -------
    chi2 : float
        The test statistic.
    p : float
        The p-value of the test.
    dof : int
        Degrees of freedom.
    expected : ndarray, same shape as `observed`
        The expected frequencies, based on the marginal sums of the table.

    See Also
    --------
    contingency.expected_freq
    fisher_exact
    chisquare
    power_divergence

    Notes
    -----
    An often quoted guideline for the validity of this calculation is that
    the test should be used only if the observed and expected frequency in
    each cell is at least 5.

    This is a test for the independence of different categories of a
    population.  The test is only meaningful when the dimension of
    `observed` is two or more.  Applying the test to a one-dimensional
    table will always result in `expected` equal to `observed` and a
    chi-square statistic equal to 0.

    This function does not handle masked arrays, because the calculation
    does not make sense with missing values.

    Like stats.chisquare, this function computes a chi-square statistic;
    the convenience this function provides is to figure out the expected
    frequencies and degrees of freedom from the given contingency table.
    If these were already known, and if the Yates' correction was not
    required, one could use stats.chisquare.  That is, if one calls::

        chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    then the following is true::

        (chi2, p) == stats.chisquare(obs.ravel(), f_exp=ex.ravel(),
                                     ddof=obs.size - 1 - dof)

    The `lambda_` argument was added in version 0.13.0 of scipy.

    References
    ----------
    .. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table
    .. [2] "Pearson's chi-squared test",
           http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
    .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
           Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
           pp. 440-464.

    Examples
    --------
    A two-way example (2 x 3):

    >>> obs = np.array([[10, 10, 20], [20, 20, 20]])
    >>> chi2_contingency(obs)
    (2.7777777777777777,
     0.24935220877729619,
     2,
     array([[ 12.,  12.,  16.],
            [ 18.,  18.,  24.]]))

    Perform the test using the log-likelihood ratio (i.e. the "G-test")
    instead of Pearson's chi-squared statistic.

    >>> g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood")
    >>> g, p
    (2.7688587616781319, 0.25046668010954165)

    A four-way example (2 x 2 x 2 x 2):

    >>> obs = np.array(
    ...     [[[[12, 17],
    ...        [11, 16]],
    ...       [[11, 12],
    ...        [15, 16]]],
    ...      [[[23, 15],
    ...        [30, 22]],
    ...       [[14, 17],
    ...        [15, 16]]]])
    >>> chi2_contingency(obs)
    (8.7584514426741897,
     0.64417725029295503,
     11,
     array([[[[ 14.15462386,  14.15462386],
              [ 16.49423111,  16.49423111]],
             [[ 11.2461395 ,  11.2461395 ],
              [ 13.10500554,  13.10500554]]],
            [[[ 19.5591166 ,  19.5591166 ],
              [ 22.79202844,  22.79202844]],
             [[ 15.54012004,  15.54012004],
              [ 18.10873492,  18.10873492]]]]))

    """
    observed = np.asarray(observed)
    if np.any(observed < 0):
        raise ValueError("All values in `observed` must be nonnegative.")
    if observed.size == 0:
        raise ValueError("No data; `observed` has size 0.")

    expected = expected_freq(observed)
    if np.any(expected == 0):
        # Include one of the positions where expected is zero in
        # the exception message.
        zeropos = list(zip(*np.where(expected == 0)))[0]
        raise ValueError("The internally computed table of expected "
                         "frequencies has a zero element at %s." % (zeropos,))

    # The degrees of freedom
    dof = expected.size - sum(expected.shape) + expected.ndim - 1

    if dof == 0:
        # Degenerate case; this occurs when `observed` is 1D (or, more
        # generally, when it has only one nontrivial dimension).  In this
        # case, we also have observed == expected, so chi2 is 0.
        chi2 = 0.0
        p = 1.0
    else:
        if dof == 1 and correction:
            # Adjust `observed` according to Yates' correction for continuity.
            observed = observed + 0.5 * np.sign(expected - observed)

        chi2, p = power_divergence(observed, expected,
                                   ddof=observed.size - 1 - dof, axis=None,
                                   lambda_=lambda_)

    return chi2, p, dof, expected

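# Editorial sanity check of the dof formula used above: for the 2 x 3
# docstring example, observed.size = 6, sum(observed.shape) = 5 and
# observed.ndim = 2, so dof = 6 - 5 + 2 - 1 = 2, which matches the familiar
# (R - 1) * (C - 1) rule for two-way tables.
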
@ -1,541 +0,0 @@
#-------------------------------------------------------------------------------
#
#  Define classes for (uni/multi)-variate kernel density estimation.
#
#  Currently, only Gaussian kernels are implemented.
#
#  Written by: Robert Kern
#
#  Date: 2004-08-09
#
#  Modified: 2005-02-10 by Robert Kern.
#              Contributed to Scipy
#            2005-10-07 by Robert Kern.
#              Some fixes to match the new scipy_core
#
#  Copyright 2004-2005 by Enthought, Inc.
#
#-------------------------------------------------------------------------------

from __future__ import division, print_function, absolute_import

# Standard library imports.
import warnings

# Scipy imports.
from scipy._lib.six import callable, string_types
from scipy import linalg, special

from numpy import atleast_2d, reshape, zeros, newaxis, dot, exp, pi, sqrt, \
    ravel, power, atleast_1d, squeeze, sum, transpose
import numpy as np
from numpy.random import randint, multivariate_normal

# Local imports.
from . import mvn


__all__ = ['gaussian_kde']


class gaussian_kde(object):
    """Representation of a kernel-density estimate using Gaussian kernels.

    Kernel density estimation is a way to estimate the probability density
    function (PDF) of a random variable in a non-parametric way.
    `gaussian_kde` works for both uni-variate and multi-variate data.  It
    includes automatic bandwidth determination.  The estimation works best
    for a unimodal distribution; bimodal or multi-modal distributions tend
    to be oversmoothed.

    Parameters
    ----------
    dataset : array_like
        Datapoints to estimate from. In case of univariate data this is a 1-D
        array, otherwise a 2-D array with shape (# of dims, # of data).
    bw_method : str, scalar or callable, optional
        The method used to calculate the estimator bandwidth.  This can be
        'scott', 'silverman', a scalar constant or a callable.  If a scalar,
        this will be used directly as `kde.factor`.  If a callable, it should
        take a `gaussian_kde` instance as only parameter and return a scalar.
        If None (default), 'scott' is used.  See Notes for more details.

    Attributes
    ----------
    dataset : ndarray
        The dataset with which `gaussian_kde` was initialized.
    d : int
        Number of dimensions.
    n : int
        Number of datapoints.
    factor : float
        The bandwidth factor, obtained from `kde.covariance_factor`, with
        which the covariance matrix is multiplied.
    covariance : ndarray
        The covariance matrix of `dataset`, scaled by the calculated bandwidth
        (`kde.factor`).
    inv_cov : ndarray
        The inverse of `covariance`.

    Methods
    -------
    kde.evaluate(points) : ndarray
        Evaluate the estimated pdf on a provided set of points.
    kde(points) : ndarray
        Same as kde.evaluate(points)
    kde.integrate_gaussian(mean, cov) : float
        Multiply pdf with a specified Gaussian and integrate over the whole
        domain.
    kde.integrate_box_1d(low, high) : float
        Integrate pdf (1D only) between two bounds.
    kde.integrate_box(low_bounds, high_bounds) : float
        Integrate pdf over a rectangular space between low_bounds and
        high_bounds.
    kde.integrate_kde(other_kde) : float
        Integrate two kernel density estimates multiplied together.
    kde.pdf(points) : ndarray
        Alias for ``kde.evaluate(points)``.
    kde.logpdf(points) : ndarray
        Equivalent to ``np.log(kde.evaluate(points))``.
    kde.resample(size=None) : ndarray
        Randomly sample a dataset from the estimated pdf.
    kde.set_bandwidth(bw_method='scott') : None
        Computes the bandwidth, i.e. the coefficient that multiplies the data
        covariance matrix to obtain the kernel covariance matrix.
        .. versionadded:: 0.11.0
    kde.covariance_factor : float
        Computes the coefficient (`kde.factor`) that multiplies the data
        covariance matrix to obtain the kernel covariance matrix.
        The default is `scotts_factor`.  A subclass can overwrite this method
        to provide a different method, or set it through a call to
        `kde.set_bandwidth`.

    Notes
    -----
    Bandwidth selection strongly influences the estimate obtained from the
    KDE (much more so than the actual shape of the kernel).  Bandwidth
    selection can be done by a "rule of thumb", by cross-validation, by
    "plug-in methods" or by other means; see [3]_, [4]_ for reviews.
    `gaussian_kde` uses a rule of thumb, the default is Scott's Rule.

    Scott's Rule [1]_, implemented as `scotts_factor`, is::

        n**(-1./(d+4)),

    with ``n`` the number of data points and ``d`` the number of dimensions.
    Silverman's Rule [2]_, implemented as `silverman_factor`, is::

        (n * (d + 2) / 4.)**(-1. / (d + 4)).

    Good general descriptions of kernel density estimation can be found in
    [1]_ and [2]_, the mathematics for this multi-dimensional implementation
    can be found in [1]_.

    References
    ----------
    .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice,
           and Visualization", John Wiley & Sons, New York, Chicester, 1992.
    .. [2] B.W. Silverman, "Density Estimation for Statistics and Data
           Analysis", Vol. 26, Monographs on Statistics and Applied
           Probability, Chapman and Hall, London, 1986.
    .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
           Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
    .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
           conditional density estimation", Computational Statistics & Data
           Analysis, Vol. 36, pp. 279-298, 2001.

    Examples
    --------
    Generate some random two-dimensional data:

    >>> from scipy import stats
    >>> def measure(n):
    ...     "Measurement model, return two coupled measurements."
    ...     m1 = np.random.normal(size=n)
    ...     m2 = np.random.normal(scale=0.5, size=n)
    ...     return m1+m2, m1-m2

    >>> m1, m2 = measure(2000)
    >>> xmin = m1.min()
    >>> xmax = m1.max()
    >>> ymin = m2.min()
    >>> ymax = m2.max()

    Perform a kernel density estimate on the data:

    >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    >>> positions = np.vstack([X.ravel(), Y.ravel()])
    >>> values = np.vstack([m1, m2])
    >>> kernel = stats.gaussian_kde(values)
    >>> Z = np.reshape(kernel(positions).T, X.shape)

    Plot the results:

    >>> import matplotlib.pyplot as plt
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
    ...           extent=[xmin, xmax, ymin, ymax])
    >>> ax.plot(m1, m2, 'k.', markersize=2)
    >>> ax.set_xlim([xmin, xmax])
    >>> ax.set_ylim([ymin, ymax])
    >>> plt.show()

    """
    def __init__(self, dataset, bw_method=None):
        self.dataset = atleast_2d(dataset)
        if not self.dataset.size > 1:
            raise ValueError("`dataset` input should have multiple elements.")

        self.d, self.n = self.dataset.shape
        self.set_bandwidth(bw_method=bw_method)

    def evaluate(self, points):
        """Evaluate the estimated pdf on a set of points.

        Parameters
        ----------
        points : (# of dimensions, # of points)-array
            Alternatively, a (# of dimensions,) vector can be passed in and
            treated as a single point.

        Returns
        -------
        values : (# of points,)-array
            The values at each point.

        Raises
        ------
        ValueError
            If the dimensionality of the input points is different from the
            dimensionality of the KDE.

        """
        points = atleast_2d(points)

        d, m = points.shape
        if d != self.d:
            if d == 1 and m == self.d:
                # points was passed in as a row vector
                points = reshape(points, (self.d, 1))
                m = 1
            else:
                msg = "points have dimension %s, dataset has dimension %s" % (
                    d, self.d)
                raise ValueError(msg)

        result = zeros((m,), dtype=float)

        if m >= self.n:
            # there are more points than data, so loop over data
            for i in range(self.n):
                diff = self.dataset[:, i, newaxis] - points
                tdiff = dot(self.inv_cov, diff)
                energy = sum(diff * tdiff, axis=0) / 2.0
                result = result + exp(-energy)
        else:
            # loop over points
            for i in range(m):
                diff = self.dataset - points[:, i, newaxis]
                tdiff = dot(self.inv_cov, diff)
                energy = sum(diff * tdiff, axis=0) / 2.0
                result[i] = sum(exp(-energy), axis=0)

        result = result / self._norm_factor

        return result

    __call__ = evaluate

    def integrate_gaussian(self, mean, cov):
        """
        Multiply estimated density by a multivariate Gaussian and integrate
        over the whole space.

        Parameters
        ----------
        mean : array_like
            A 1-D array, specifying the mean of the Gaussian.
        cov : array_like
            A 2-D array, specifying the covariance matrix of the Gaussian.

        Returns
        -------
        result : scalar
            The value of the integral.

        Raises
        ------
        ValueError
            If the mean or covariance of the input Gaussian differs from
            the KDE's dimensionality.

        """
        mean = atleast_1d(squeeze(mean))
        cov = atleast_2d(cov)

        if mean.shape != (self.d,):
            raise ValueError("mean does not have dimension %s" % self.d)
        if cov.shape != (self.d, self.d):
            raise ValueError("covariance does not have dimension %s" % self.d)

        # make mean a column vector
        mean = mean[:, newaxis]

        sum_cov = self.covariance + cov

        diff = self.dataset - mean
        tdiff = dot(linalg.inv(sum_cov), diff)

        energies = sum(diff * tdiff, axis=0) / 2.0
        result = sum(exp(-energies), axis=0) / \
            sqrt(linalg.det(2 * pi * sum_cov)) / self.n

        return result

    def integrate_box_1d(self, low, high):
        """
        Computes the integral of a 1D pdf between two bounds.

        Parameters
        ----------
        low : scalar
            Lower bound of integration.
        high : scalar
            Upper bound of integration.

        Returns
        -------
        value : scalar
            The result of the integral.

        Raises
        ------
        ValueError
            If the KDE is over more than one dimension.

        """
        if self.d != 1:
            raise ValueError("integrate_box_1d() only handles 1D pdfs")

        stdev = ravel(sqrt(self.covariance))[0]

        normalized_low = ravel((low - self.dataset) / stdev)
        normalized_high = ravel((high - self.dataset) / stdev)

        value = np.mean(special.ndtr(normalized_high) -
                        special.ndtr(normalized_low))
        return value

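    # Editorial note on the method above: each kernel is a normal density
    # centred on a data point with standard deviation `stdev`, so the exact
    # integral is the average of normal-CDF differences,
    #     value = mean_i(ndtr((high - x_i)/stdev) - ndtr((low - x_i)/stdev)),
    # which is precisely what the vectorized code computes.
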
    def integrate_box(self, low_bounds, high_bounds, maxpts=None):
        """Computes the integral of a pdf over a rectangular interval.

        Parameters
        ----------
        low_bounds : array_like
            A 1-D array containing the lower bounds of integration.
        high_bounds : array_like
            A 1-D array containing the upper bounds of integration.
        maxpts : int, optional
            The maximum number of points to use for integration.

        Returns
        -------
        value : scalar
            The result of the integral.

        """
        if maxpts is not None:
            extra_kwds = {'maxpts': maxpts}
        else:
            extra_kwds = {}

        value, inform = mvn.mvnun(low_bounds, high_bounds, self.dataset,
                                  self.covariance, **extra_kwds)
        if inform:
            msg = ('An integral in mvn.mvnun requires more points than %s' %
                   (self.d * 1000))
            warnings.warn(msg)

        return value

    def integrate_kde(self, other):
        """
        Computes the integral of the product of this kernel density estimate
        with another.

        Parameters
        ----------
        other : gaussian_kde instance
            The other kde.

        Returns
        -------
        value : scalar
            The result of the integral.

        Raises
        ------
        ValueError
            If the KDEs have different dimensionality.

        """
        if other.d != self.d:
            raise ValueError("KDEs are not the same dimensionality")

        # we want to iterate over the smallest number of points
        if other.n < self.n:
            small = other
            large = self
        else:
            small = self
            large = other

        sum_cov = small.covariance + large.covariance
        sum_cov_chol = linalg.cho_factor(sum_cov)
        result = 0.0
        for i in range(small.n):
            mean = small.dataset[:, i, newaxis]
            diff = large.dataset - mean
            tdiff = linalg.cho_solve(sum_cov_chol, diff)

            energies = sum(diff * tdiff, axis=0) / 2.0
            result += sum(exp(-energies), axis=0)

        result /= sqrt(linalg.det(2 * pi * sum_cov)) * large.n * small.n

        return result

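    # Editorial note on the method above: it relies on the Gaussian product
    # identity  integral N(x; a, A) N(x; b, B) dx = N(a; b, A + B),  so the
    # double sum over both datasets only needs the combined covariance
    # `sum_cov`, factored once by Cholesky and reused for every column of
    # differences.
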
    def resample(self, size=None):
        """
        Randomly sample a dataset from the estimated pdf.

        Parameters
        ----------
        size : int, optional
            The number of samples to draw.  If not provided, then the size is
            the same as the underlying dataset.

        Returns
        -------
        resample : (self.d, `size`) ndarray
            The sampled dataset.

        """
        if size is None:
            size = self.n

        norm = transpose(multivariate_normal(zeros((self.d,), float),
                                             self.covariance, size=size))
        indices = randint(0, self.n, size=size)
        means = self.dataset[:, indices]

        return means + norm

    def scotts_factor(self):
        return power(self.n, -1. / (self.d + 4))

    def silverman_factor(self):
        return power(self.n * (self.d + 2.0) / 4.0, -1. / (self.d + 4))

    #  Default method to calculate bandwidth, can be overwritten by subclass
    covariance_factor = scotts_factor

    def set_bandwidth(self, bw_method=None):
        """Compute the estimator bandwidth with given method.

        The new bandwidth calculated after a call to `set_bandwidth` is used
        for subsequent evaluations of the estimated density.

        Parameters
        ----------
        bw_method : str, scalar or callable, optional
            The method used to calculate the estimator bandwidth.  This can
            be 'scott', 'silverman', a scalar constant or a callable.  If a
            scalar, this will be used directly as `kde.factor`.  If a
            callable, it should take a `gaussian_kde` instance as only
            parameter and return a scalar.  If None (default), nothing
            happens; the current `kde.covariance_factor` method is kept.

        Notes
        -----
        .. versionadded:: 0.11

        Examples
        --------
        >>> x1 = np.array([-7, -5, 1, 4, 5.])
        >>> kde = stats.gaussian_kde(x1)
        >>> xs = np.linspace(-10, 10, num=50)
        >>> y1 = kde(xs)
        >>> kde.set_bandwidth(bw_method='silverman')
        >>> y2 = kde(xs)
        >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
        >>> y3 = kde(xs)

        >>> fig = plt.figure()
        >>> ax = fig.add_subplot(111)
        >>> ax.plot(x1, np.ones(x1.shape) / (4. * x1.size), 'bo',
        ...         label='Data points (rescaled)')
        >>> ax.plot(xs, y1, label='Scott (default)')
        >>> ax.plot(xs, y2, label='Silverman')
        >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
        >>> ax.legend()
        >>> plt.show()

        """
        if bw_method is None:
            pass
        elif bw_method == 'scott':
            self.covariance_factor = self.scotts_factor
        elif bw_method == 'silverman':
            self.covariance_factor = self.silverman_factor
        elif np.isscalar(bw_method) and not isinstance(bw_method,
                                                       string_types):
            self._bw_method = 'use constant'
            self.covariance_factor = lambda: bw_method
        elif callable(bw_method):
            self._bw_method = bw_method
            self.covariance_factor = lambda: self._bw_method(self)
        else:
            msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
                  "or a callable."
            raise ValueError(msg)

        self._compute_covariance()

    def _compute_covariance(self):
        """Computes the covariance matrix for each Gaussian kernel using
        covariance_factor().
        """
        self.factor = self.covariance_factor()
        # Cache covariance and inverse covariance of the data
        if not hasattr(self, '_data_inv_cov'):
            self._data_covariance = atleast_2d(np.cov(self.dataset, rowvar=1,
                                                      bias=False))
            self._data_inv_cov = linalg.inv(self._data_covariance)

        self.covariance = self._data_covariance * self.factor**2
        self.inv_cov = self._data_inv_cov / self.factor**2
        self._norm_factor = sqrt(linalg.det(2*pi*self.covariance)) * self.n

    def pdf(self, x):
        """
        Evaluate the estimated pdf on a provided set of points.

        Notes
        -----
        This is an alias for `gaussian_kde.evaluate`.  See the ``evaluate``
        docstring for more details.

        """
        return self.evaluate(x)

    def logpdf(self, x):
        """
        Evaluate the log of the estimated pdf on a provided set of points.

        Notes
        -----
        See `gaussian_kde.evaluate` for more details; this method simply
        returns ``np.log(gaussian_kde.evaluate(x))``.

        """
        return np.log(self.evaluate(x))

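# Editorial usage sketch for the class above; assumes this module is
# importable (e.g. as wafo.stats.kde) and needs only numpy.
#
#     import numpy as np
#
#     x = np.random.normal(size=500)          # 1-D data
#     kde = gaussian_kde(x)                   # Scott's rule by default
#     grid = np.linspace(-3, 3, 101)
#     density = kde(grid)                     # same as kde.evaluate(grid)
#     mass = kde.integrate_box_1d(-1.0, 1.0)  # estimated P(-1 < X < 1)
#     new_x = kde.resample(size=200)          # (1, 200) draw from the KDE
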
@ -1,15 +0,0 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 06 16:02:47 2011

@author: pab
"""
import numpy as np
import wafo.kdetools as wk
n = 100
x = np.sort(5 * np.random.rand(1, n) - 2.5, axis=-1).ravel()
# draw one uniform per sample so the comparison broadcasts to shape (n,)
y = (np.cos(x) > 2 * np.random.rand(n) - 1).ravel()

kreg = wk.KRegression(x, y)
f = kreg(output='plotobj', title='Kernel regression', plotflag=1)
f.plot()
@ -1,13 +0,0 @@
from numpy import asarray, ndarray, ones, nan  # , reshape, repeat, product


def valarray(shape, value=nan, typecode=None):
    """Return an array of all value.
    """
    # out = reshape(repeat([value], product(shape, axis=0), axis=0), shape)
    out = ones(shape, dtype=bool) * value
    if typecode is not None:
        out = out.astype(typecode)
    if not isinstance(out, ndarray):
        out = asarray(out)
    return out

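# Editorial usage sketch: valarray fills a shape with a single value
# (nan by default).
#
#     >>> valarray((2, 3), value=0.5)
#     array([[ 0.5,  0.5,  0.5],
#            [ 0.5,  0.5,  0.5]])
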
File diff suppressed because it is too large
@ -1,79 +0,0 @@
"""
===================================================================
Statistical functions for masked arrays (:mod:`scipy.stats.mstats`)
===================================================================

.. currentmodule:: scipy.stats.mstats

This module contains a large number of statistical functions that can
be used with masked arrays.

Most of these functions are similar to those in scipy.stats but might
have small differences in the API or in the algorithm used. Since this
is a relatively new package, some API changes are still possible.

.. autosummary::
   :toctree: generated/

   argstoarray
   betai
   chisquare
   count_tied_groups
   describe
   f_oneway
   f_value_wilks_lambda
   find_repeats
   friedmanchisquare
   kendalltau
   kendalltau_seasonal
   kruskalwallis
   ks_twosamp
   kurtosis
   kurtosistest
   linregress
   mannwhitneyu
   mode
   moment
   mquantiles
   msign
   normaltest
   obrientransform
   pearsonr
   plotting_positions
   pointbiserialr
   rankdata
   scoreatpercentile
   sem
   signaltonoise
   skew
   skewtest
   spearmanr
   theilslopes
   threshold
   tmax
   tmean
   tmin
   trim
   trima
   trimboth
   trimmed_stde
   trimr
   trimtail
   tsem
   ttest_ind
   ttest_onesamp
   ttest_rel
   tvar
   variation
   winsorize
   zmap
   zscore

"""
from __future__ import division, print_function, absolute_import

from .mstats_basic import *
from .mstats_extras import *
from scipy.stats import gmean, hmean

File diff suppressed because it is too large
@ -1,451 +0,0 @@
"""
Additional statistics functions with support for masked arrays.

"""

# Original author (2007): Pierre GF Gerard-Marchant


from __future__ import division, print_function, absolute_import


__all__ = ['compare_medians_ms',
           'hdquantiles', 'hdmedian', 'hdquantiles_sd',
           'idealfourths',
           'median_cihs', 'mjci', 'mquantiles_cimj',
           'rsh',
           'trimmed_mean_ci', ]


import numpy as np
from numpy import float_, int_, ndarray

import numpy.ma as ma
from numpy.ma import MaskedArray

from . import mstats_basic as mstats

from scipy.stats.distributions import norm, beta, t, binom


def hdquantiles(data, prob=list([.25, .5, .75]), axis=None, var=False,):
    """
    Computes quantile estimates with the Harrell-Davis method.

    The quantile estimates are calculated as a weighted linear combination
    of order statistics.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence
        Sequence of quantiles to compute.
    axis : int
        Axis along which to compute the quantiles. If None, use a flattened
        array.
    var : boolean
        Whether to return the variance of the estimate.

    Returns
    -------
    hdquantiles : MaskedArray
        A (p,) array of quantiles (if `var` is False), or a (2,p) array of
        quantiles and variances (if `var` is True), where ``p`` is the
        number of quantiles.

    """
    def _hd_1D(data, prob, var):
        "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
        xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
        # Don't use length here, in case we have a numpy scalar
        n = xsorted.size

        hd = np.empty((2, len(prob)), float_)
        if n < 2:
            hd.flat = np.nan
            if var:
                return hd
            return hd[0]

        v = np.arange(n + 1) / float(n)
        betacdf = beta.cdf
        for (i, p) in enumerate(prob):
            _w = betacdf(v, (n + 1) * p, (n + 1) * (1 - p))
            w = _w[1:] - _w[:-1]
            hd_mean = np.dot(w, xsorted)
            hd[0, i] = hd_mean
            #
            hd[1, i] = np.dot(w, (xsorted - hd_mean) ** 2)
        #
        hd[0, prob == 0] = xsorted[0]
        hd[0, prob == 1] = xsorted[-1]
        if var:
            hd[1, prob == 0] = hd[1, prob == 1] = np.nan
            return hd
        return hd[0]
    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float_)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None) or (data.ndim == 1):
        result = _hd_1D(data, p, var)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hd_1D, axis, data, p, var)

    return ma.fix_invalid(result, copy=False)

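# Editorial note on the estimator above: with a = (n+1)*p and
# b = (n+1)*(1-p), the weights are increments of the Beta(a, b) cdf,
#     w_i = I_{i/n}(a, b) - I_{(i-1)/n}(a, b),    i = 1, ..., n,
# and the Harrell-Davis estimate is sum_i w_i * x_(i), a smooth weighted
# combination of all order statistics rather than a single one.
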
def hdmedian(data, axis=-1, var=False):
    """
    Returns the Harrell-Davis estimate of the median along the given axis.

    Parameters
    ----------
    data : ndarray
        Data array.
    axis : int
        Axis along which to compute the quantiles. If None, use a flattened
        array.
    var : boolean
        Whether to return the variance of the estimate.

    """
    result = hdquantiles(data, [0.5], axis=axis, var=var)
    return result.squeeze()

def hdquantiles_sd(data, prob=list([.25, .5, .75]), axis=None):
    """
    The standard error of the Harrell-Davis quantile estimates by jackknife.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence
        Sequence of quantiles to compute.
    axis : int
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    hdquantiles_sd : MaskedArray
        Standard error of the Harrell-Davis quantile estimates.

    """
    def _hdsd_1D(data, prob):
        "Computes the std error for 1D arrays."
        xsorted = np.sort(data.compressed())
        n = len(xsorted)

        hdsd = np.empty(len(prob), float_)
        if n < 2:
            hdsd.flat = np.nan
            return hdsd

        vv = np.arange(n) / float(n - 1)
        betacdf = beta.cdf

        for (i, p) in enumerate(prob):
            _w = betacdf(vv, (n + 1) * p, (n + 1) * (1 - p))
            w = _w[1:] - _w[:-1]
            mx_ = np.fromiter([np.dot(w, xsorted[np.r_[list(range(0, k)),
                                                       list(range(k + 1, n))
                                                       ].astype(int_)])
                               for k in range(n)], dtype=float_)
            mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n - 1)
            hdsd[i] = float(n - 1) * np.sqrt(np.diag(mx_var).diagonal() /
                                             float(n))
        return hdsd
    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float_)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _hdsd_1D(data, p)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hdsd_1D, axis, data, p)

    return ma.fix_invalid(result, copy=False).ravel()

def trimmed_mean_ci(data, limits=(0.2, 0.2), inclusive=(True, True),
                    alpha=0.05, axis=None):
    """
    Selected confidence interval of the trimmed mean along the given axis.

    Parameters
    ----------
    data : array_like
        Input data.
    limits : {None, tuple}, optional
        None or a two item tuple.
        Tuple of the percentages to cut on each side of the array, with
        respect to the number of unmasked data, as floats between 0. and 1.
        If ``n`` is the number of unmasked data before trimming, then
        (``n * limits[0]``)th smallest data and (``n * limits[1]``)th
        largest data are masked.  The total number of unmasked data after
        trimming is ``n * (1. - sum(limits))``.
        The value of one limit can be set to None to indicate an open
        interval.

        Defaults to (0.2, 0.2).
    inclusive : (2,) tuple of boolean, optional
        If relative==False, tuple indicating whether values exactly equal to
        the absolute limits are allowed.
        If relative==True, tuple indicating whether the number of data being
        masked on each side should be rounded (True) or truncated (False).

        Defaults to (True, True).
    alpha : float, optional
        Confidence level of the intervals.

        Defaults to 0.05.
    axis : int, optional
        Axis along which to cut. If None, uses a flattened version of `data`.

        Defaults to None.

    Returns
    -------
    trimmed_mean_ci : (2,) ndarray
        The lower and upper confidence intervals of the trimmed data.

    """
    data = ma.array(data, copy=False)
    trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive,
                           axis=axis)
    tmean = trimmed.mean(axis)
    tstde = mstats.trimmed_stde(data, limits=limits, inclusive=inclusive,
                                axis=axis)
    df = trimmed.count(axis) - 1
    tppf = t.ppf(1 - alpha / 2., df)
    return np.array((tmean - tppf * tstde, tmean + tppf * tstde))

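# Editorial note on the interval above: it is the usual Student-t interval
# applied to the trimmed sample,
#     tmean +/- t.ppf(1 - alpha/2, df) * tstde,    df = count - 1,
# with the standard error taken from the trimmed-mean estimator
# mstats.trimmed_stde rather than the raw sample standard error.
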
def mjci(data, prob=[0.25, 0.5, 0.75], axis=None):
    """
    Returns the Maritz-Jarrett estimators of the standard error of selected
    experimental quantiles of the data.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence
        Sequence of quantiles to compute.
    axis : int
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    """
    def _mjci_1D(data, p):
        data = np.sort(data.compressed())
        n = data.size
        prob = (np.array(p) * n + 0.5).astype(int_)
        betacdf = beta.cdf

        mj = np.empty(len(prob), float_)
        x = np.arange(1, n + 1, dtype=float_) / n
        y = x - 1. / n
        for (i, m) in enumerate(prob):
            (m1, m2) = (m - 1, n - m)
            W = betacdf(x, m - 1, n - m) - betacdf(y, m - 1, n - m)
            C1 = np.dot(W, data)
            C2 = np.dot(W, data ** 2)
            mj[i] = np.sqrt(C2 - C1 ** 2)
        return mj

    data = ma.array(data, copy=False)
    if data.ndim > 2:
        raise ValueError("Array 'data' must be at most two dimensional, "
                         "but got data.ndim = %d" % data.ndim)

    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        return _mjci_1D(data, p)
    else:
        return ma.apply_along_axis(_mjci_1D, axis, data, p)

def mquantiles_cimj(data, prob=[0.25, 0.50, 0.75], alpha=0.05, axis=None):
    """
    Computes the alpha confidence interval for the selected quantiles of the
    data, with Maritz-Jarrett estimators.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence
        Sequence of quantiles to compute.
    alpha : float
        Confidence level of the intervals.
    axis : integer
        Axis along which to compute the quantiles.
        If None, use a flattened array.

    """
    alpha = min(alpha, 1 - alpha)
    z = norm.ppf(1 - alpha / 2.)
    xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    smj = mjci(data, prob, axis=axis)
    return (xq - z * smj, xq + z * smj)

def median_cihs(data, alpha=0.05, axis=None):
    """
    Computes the alpha-level confidence interval for the median of the data.

    Uses the Hettmansperger-Sheather method.

    Parameters
    ----------
    data : array_like
        Input data. Masked values are discarded. The input should be 1D only,
        or `axis` should be set to None.
    alpha : float
        Confidence level of the intervals.
    axis : integer
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    median_cihs :
        Alpha level confidence interval.

    """
    def _cihs_1D(data, alpha):
        data = np.sort(data.compressed())
        n = len(data)
        alpha = min(alpha, 1 - alpha)
        k = int(binom._ppf(alpha / 2., n, 0.5))
        gk = binom.cdf(n - k, n, 0.5) - binom.cdf(k - 1, n, 0.5)
        if gk < 1 - alpha:
            k -= 1
            gk = binom.cdf(n - k, n, 0.5) - binom.cdf(k - 1, n, 0.5)
        gkk = binom.cdf(n - k - 1, n, 0.5) - binom.cdf(k, n, 0.5)
        I = (gk - 1 + alpha) / (gk - gkk)
        lambd = (n - k) * I / float(k + (n - 2 * k) * I)
        lims = (lambd * data[k] + (1 - lambd) * data[k - 1],
                lambd * data[n - k - 1] + (1 - lambd) * data[n - k])
        return lims
    data = ma.array(data, copy=False)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _cihs_1D(data.compressed(), alpha)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)

    return result

def compare_medians_ms(group_1, group_2, axis=None):
    """
    Compares the medians from two independent groups along the given axis.

    The comparison is performed using the McKean-Schrader estimate of the
    standard error of the medians.

    Parameters
    ----------
    group_1 : array_like
        First dataset.
    group_2 : array_like
        Second dataset.
    axis : int, optional
        Axis along which the medians are estimated. If None, the arrays are
        flattened.  If `axis` is not None, then `group_1` and `group_2`
        should have the same shape.

    Returns
    -------
    compare_medians_ms : {float, ndarray}
        If `axis` is None, then returns a float, otherwise returns a 1-D
        ndarray of floats with a length equal to the length of `group_1`
        along `axis`.

    """
    (med_1, med_2) = (ma.median(group_1, axis=axis),
                      ma.median(group_2, axis=axis))
    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
                      mstats.stde_median(group_2, axis=axis))
    W = np.abs(med_1 - med_2) / ma.sqrt(std_1 ** 2 + std_2 ** 2)
    return 1 - norm.cdf(W)

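# Editorial usage sketch: the return value is the one-sided p-value
# 1 - Phi(|med_1 - med_2| / sqrt(se_1**2 + se_2**2)); a small value suggests
# the medians differ.
#
#     g1 = [17, 21, 22, 25, 29, 31, 33]
#     g2 = [14, 16, 18, 19, 20, 22, 24]
#     p = compare_medians_ms(g1, g2)   # a float in [0, 1]
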
def idealfourths(data, axis=None):
    """
    Returns an estimate of the lower and upper quartiles.

    Uses the ideal fourths algorithm.

    Parameters
    ----------
    data : array_like
        Input array.
    axis : int, optional
        Axis along which the quartiles are estimated. If None, the arrays are
        flattened.

    Returns
    -------
    idealfourths : {list of floats, masked array}
        Returns the two internal values that divide `data` into four parts
        using the ideal fourths algorithm either along the flattened array
        (if `axis` is None) or along `axis` of `data`.

    """
    def _idf(data):
        x = data.compressed()
        n = len(x)
        if n < 3:
            return [np.nan, np.nan]
        (j, h) = divmod(n / 4. + 5 / 12., 1)
        j = int(j)
        qlo = (1 - h) * x[j - 1] + h * x[j]
        k = n - j
        qup = (1 - h) * x[k] + h * x[k - 1]
        return [qlo, qup]
    data = ma.sort(data, axis=axis).view(MaskedArray)
    if (axis is None):
        return _idf(data)
    else:
        return ma.apply_along_axis(_idf, axis, data)

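# Editorial worked example of the interpolation above: for n = 20,
# n/4 + 5/12 = 5.41666..., so j = 5 and h = 0.41666..., giving
#     qlo = (1 - h) * x[4] + h * x[5],
# i.e. the lower "ideal fourth" sits 5/12 of the way between the 5th and
# 6th order statistics; qup mirrors this from the upper end.
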
def rsh(data, points=None):
    """
    Evaluates Rosenblatt's shifted histogram estimators for each point
    on the dataset 'data'.

    Parameters
    ----------
    data : sequence
        Input data. Masked values are ignored.
    points : sequence
        Sequence of points where to evaluate Rosenblatt shifted histogram.
        If None, use the data.

    """
    data = ma.array(data, copy=False)
    if points is None:
        points = data
    else:
        points = np.array(points, copy=False, ndmin=1)

    if data.ndim != 1:
        raise AttributeError("The input array should be 1D only!")

    n = data.count()
    r = idealfourths(data, axis=None)
    h = 1.2 * (r[-1] - r[0]) / n ** (1. / 5)
    nhi = (data[:, None] <= points[None, :] + h).sum(0)
    nlo = (data[:, None] < points[None, :] - h).sum(0)
    return (nhi - nlo) / (2. * n * h)

@ -1,76 +0,0 @@
from __future__ import division, print_function, absolute_import

from numpy import vectorize, deprecate
from numpy.random import random_sample

__all__ = ['randwppf', 'randwcdf']

# XXX: Are these needed anymore?

#####################################
# General purpose continuous
######################################


@deprecate(message="Deprecated in scipy 0.14.0, use "
                   "distribution-specific rvs() method instead")
def randwppf(ppf, args=(), size=None):
    """
    Returns an array of randomly distributed values from a distribution
    whose percent point function (inverse of the CDF or quantile function)
    is given.

    args is a tuple of extra arguments to the ppf function (i.e. shape,
    location, scale), and size is the size of the output.  Note the ppf
    function must accept an array of q values to compute over.

    """
    U = random_sample(size=size)
    return ppf(*(U,) + args)

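# Editorial usage sketch: inverse-transform sampling with the standard
# exponential quantile function ppf(q) = -log(1 - q).
#
#     import numpy as np
#     samples = randwppf(lambda q: -np.log(1 - q), size=1000)
#     # `samples` is then approximately Exp(1) distributed
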
@deprecate(message="Deprecated in scipy 0.14.0, use "
                   "distribution-specific rvs() method instead")
def randwcdf(cdf, mean=1.0, args=(), size=None):
    """
    Returns an array of randomly distributed values given a CDF.

    Given a cumulative distribution function (CDF) returns an array of
    randomly distributed values that would satisfy the CDF.

    Parameters
    ----------
    cdf : function
        CDF function that accepts a single value and `args`, and returns
        a single value.
    mean : float, optional
        The mean of the distribution which helps the solver.  Defaults
        to 1.0.
    args : tuple, optional
        Extra arguments to the cdf function (i.e. shape, location, scale)
    size : {int, None}, optional
        Is the size of the output.  If None, only 1 value will be returned.

    Returns
    -------
    randwcdf : ndarray
        Array of random numbers.

    Notes
    -----
    Can use the ``scipy.stats.distributions.*.cdf`` functions for the
    `cdf` parameter.

    """
    import scipy.optimize as optimize

    def _ppfopt(x, q, *nargs):
        newargs = (x,) + nargs
        return cdf(*newargs) - q

    def _ppf(q, *nargs):
        return optimize.fsolve(_ppfopt, mean, args=(q,) + nargs)

    _vppf = vectorize(_ppf)
    U = random_sample(size=size)
    return _vppf(*(U,) + args)

@@ -1,389 +0,0 @@
"""Utilities for writing code that runs on Python 2 and 3"""

# Copyright (c) 2010-2012 Benjamin Peterson
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import operator
import sys
import types

__author__ = "Benjamin Peterson <benjamin@python.org>"
__version__ = "1.2.0"


# True if we are running on Python 3.
PY3 = sys.version_info[0] == 3

if PY3:
    string_types = str,
    integer_types = int,
    class_types = type,
    text_type = str
    binary_type = bytes

    MAXSIZE = sys.maxsize
else:
    string_types = basestring,
    integer_types = (int, long)
    class_types = (type, types.ClassType)
    text_type = unicode
    binary_type = str

    if sys.platform.startswith("java"):
        # Jython always uses 32 bits.
        MAXSIZE = int((1 << 31) - 1)
    else:
        # It's possible to have sizeof(long) != sizeof(Py_ssize_t).
        class X(object):
            def __len__(self):
                return 1 << 31
        try:
            len(X())
        except OverflowError:
            # 32-bit
            MAXSIZE = int((1 << 31) - 1)
        else:
            # 64-bit
            MAXSIZE = int((1 << 63) - 1)
        del X


def _add_doc(func, doc):
    """Add documentation to a function."""
    func.__doc__ = doc


def _import_module(name):
    """Import module, returning the module after the last dot."""
    __import__(name)
    return sys.modules[name]


class _LazyDescr(object):

    def __init__(self, name):
        self.name = name

    def __get__(self, obj, tp):
        result = self._resolve()
        setattr(obj, self.name, result)
        # This is a bit ugly, but it avoids running this again.
        delattr(tp, self.name)
        return result


class MovedModule(_LazyDescr):

    def __init__(self, name, old, new=None):
        super(MovedModule, self).__init__(name)
        if PY3:
            if new is None:
                new = name
            self.mod = new
        else:
            self.mod = old

    def _resolve(self):
        return _import_module(self.mod)


class MovedAttribute(_LazyDescr):

    def __init__(self, name, old_mod, new_mod, old_attr=None, new_attr=None):
        super(MovedAttribute, self).__init__(name)
        if PY3:
            if new_mod is None:
                new_mod = name
            self.mod = new_mod
            if new_attr is None:
                if old_attr is None:
                    new_attr = name
                else:
                    new_attr = old_attr
            self.attr = new_attr
        else:
            self.mod = old_mod
            if old_attr is None:
                old_attr = name
            self.attr = old_attr

    def _resolve(self):
        module = _import_module(self.mod)
        return getattr(module, self.attr)


class _MovedItems(types.ModuleType):
    """Lazy loading of moved objects"""


_moved_attributes = [
    MovedAttribute("cStringIO", "cStringIO", "io", "StringIO"),
    MovedAttribute("filter", "itertools", "builtins", "ifilter", "filter"),
    MovedAttribute("input", "__builtin__", "builtins", "raw_input", "input"),
    MovedAttribute("map", "itertools", "builtins", "imap", "map"),
    MovedAttribute("reload_module", "__builtin__", "imp", "reload"),
    MovedAttribute("reduce", "__builtin__", "functools"),
    MovedAttribute("StringIO", "StringIO", "io"),
    MovedAttribute("xrange", "__builtin__", "builtins", "xrange", "range"),
    MovedAttribute("zip", "itertools", "builtins", "izip", "zip"),

    MovedModule("builtins", "__builtin__"),
    MovedModule("configparser", "ConfigParser"),
    MovedModule("copyreg", "copy_reg"),
    MovedModule("http_cookiejar", "cookielib", "http.cookiejar"),
    MovedModule("http_cookies", "Cookie", "http.cookies"),
    MovedModule("html_entities", "htmlentitydefs", "html.entities"),
    MovedModule("html_parser", "HTMLParser", "html.parser"),
    MovedModule("http_client", "httplib", "http.client"),
    MovedModule("email_mime_multipart", "email.MIMEMultipart", "email.mime.multipart"),
    MovedModule("email_mime_text", "email.MIMEText", "email.mime.text"),
    MovedModule("email_mime_base", "email.MIMEBase", "email.mime.base"),
    MovedModule("BaseHTTPServer", "BaseHTTPServer", "http.server"),
    MovedModule("CGIHTTPServer", "CGIHTTPServer", "http.server"),
    MovedModule("SimpleHTTPServer", "SimpleHTTPServer", "http.server"),
    MovedModule("cPickle", "cPickle", "pickle"),
    MovedModule("queue", "Queue"),
    MovedModule("reprlib", "repr"),
    MovedModule("socketserver", "SocketServer"),
    MovedModule("tkinter", "Tkinter"),
    MovedModule("tkinter_dialog", "Dialog", "tkinter.dialog"),
    MovedModule("tkinter_filedialog", "FileDialog", "tkinter.filedialog"),
    MovedModule("tkinter_scrolledtext", "ScrolledText", "tkinter.scrolledtext"),
    MovedModule("tkinter_simpledialog", "SimpleDialog", "tkinter.simpledialog"),
    MovedModule("tkinter_tix", "Tix", "tkinter.tix"),
    MovedModule("tkinter_constants", "Tkconstants", "tkinter.constants"),
    MovedModule("tkinter_dnd", "Tkdnd", "tkinter.dnd"),
    MovedModule("tkinter_colorchooser", "tkColorChooser",
                "tkinter.colorchooser"),
    MovedModule("tkinter_commondialog", "tkCommonDialog",
                "tkinter.commondialog"),
    MovedModule("tkinter_tkfiledialog", "tkFileDialog", "tkinter.filedialog"),
    MovedModule("tkinter_font", "tkFont", "tkinter.font"),
    MovedModule("tkinter_messagebox", "tkMessageBox", "tkinter.messagebox"),
    MovedModule("tkinter_tksimpledialog", "tkSimpleDialog",
                "tkinter.simpledialog"),
    MovedModule("urllib_robotparser", "robotparser", "urllib.robotparser"),
    MovedModule("winreg", "_winreg"),
]
for attr in _moved_attributes:
    setattr(_MovedItems, attr.name, attr)
del attr

moves = sys.modules[__name__ + ".moves"] = _MovedItems("moves")


def add_move(move):
    """Add an item to six.moves."""
    setattr(_MovedItems, move.name, move)


def remove_move(name):
    """Remove item from six.moves."""
    try:
        delattr(_MovedItems, name)
    except AttributeError:
        try:
            del moves.__dict__[name]
        except KeyError:
            raise AttributeError("no such move, %r" % (name,))


if PY3:
    _meth_func = "__func__"
    _meth_self = "__self__"

    _func_code = "__code__"
    _func_defaults = "__defaults__"

    _iterkeys = "keys"
    _itervalues = "values"
    _iteritems = "items"
else:
    _meth_func = "im_func"
    _meth_self = "im_self"

    _func_code = "func_code"
    _func_defaults = "func_defaults"

    _iterkeys = "iterkeys"
    _itervalues = "itervalues"
    _iteritems = "iteritems"


try:
    advance_iterator = next
except NameError:
    def advance_iterator(it):
        return it.next()
next = advance_iterator


if PY3:
    def get_unbound_function(unbound):
        return unbound

    Iterator = object

    def callable(obj):
        return any("__call__" in klass.__dict__ for klass in type(obj).__mro__)
else:
    def get_unbound_function(unbound):
        return unbound.im_func

    class Iterator(object):

        def next(self):
            return type(self).__next__(self)

    callable = callable
_add_doc(get_unbound_function,
         """Get the function out of a possibly unbound function""")


get_method_function = operator.attrgetter(_meth_func)
get_method_self = operator.attrgetter(_meth_self)
get_function_code = operator.attrgetter(_func_code)
get_function_defaults = operator.attrgetter(_func_defaults)


def iterkeys(d):
    """Return an iterator over the keys of a dictionary."""
    return iter(getattr(d, _iterkeys)())


def itervalues(d):
    """Return an iterator over the values of a dictionary."""
    return iter(getattr(d, _itervalues)())


def iteritems(d):
    """Return an iterator over the (key, value) pairs of a dictionary."""
    return iter(getattr(d, _iteritems)())


if PY3:
    def b(s):
        return s.encode("latin-1")

    def u(s):
        return s

    if sys.version_info[1] <= 1:
        def int2byte(i):
            return bytes((i,))
    else:
        # This is about 2x faster than the implementation above on 3.2+
        int2byte = operator.methodcaller("to_bytes", 1, "big")
    import io
    StringIO = io.StringIO
    BytesIO = io.BytesIO
else:
    def b(s):
        return s

    def u(s):
        return unicode(s, "unicode_escape")
    int2byte = chr
    import StringIO
    StringIO = BytesIO = StringIO.StringIO
_add_doc(b, """Byte literal""")
_add_doc(u, """Text literal""")


if PY3:
    import builtins  # @UnresolvedImport
    exec_ = getattr(builtins, "exec")

    def reraise(tp, value, tb=None):
        if value.__traceback__ is not tb:
            raise value.with_traceback(tb)
        raise value

    print_ = getattr(builtins, "print")
    del builtins

else:
    def exec_(code, globs=None, locs=None):
        """Execute code in a namespace."""
        if globs is None:
            frame = sys._getframe(1)
            globs = frame.f_globals
            if locs is None:
                locs = frame.f_locals
            del frame
        elif locs is None:
            locs = globs
        exec("""exec code in globs, locs""")

    exec_("""def reraise(tp, value, tb=None):
    raise tp, value, tb
""")

    def print_(*args, **kwargs):
        """The new-style print function."""
        fp = kwargs.pop("file", sys.stdout)
        if fp is None:
            return

        def write(data):
            if not isinstance(data, basestring):
                data = str(data)
            fp.write(data)
        want_unicode = False
        sep = kwargs.pop("sep", None)
        if sep is not None:
            if isinstance(sep, unicode):
                want_unicode = True
            elif not isinstance(sep, str):
                raise TypeError("sep must be None or a string")
        end = kwargs.pop("end", None)
        if end is not None:
            if isinstance(end, unicode):
                want_unicode = True
            elif not isinstance(end, str):
                raise TypeError("end must be None or a string")
        if kwargs:
            raise TypeError("invalid keyword arguments to print()")
        if not want_unicode:
            for arg in args:
                if isinstance(arg, unicode):
                    want_unicode = True
                    break
        if want_unicode:
            newline = unicode("\n")
            space = unicode(" ")
        else:
            newline = "\n"
            space = " "
        if sep is None:
            sep = space
        if end is None:
            end = newline
        for i, arg in enumerate(args):
            if i:
                write(sep)
            write(arg)
        write(end)

_add_doc(reraise, """Reraise an exception.""")


def with_metaclass(meta, base=object):
    """Create a base class with a metaclass."""
    return meta("NewBase", (base,), {})
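For readers tracing what this shim provided: call sites branch once on these helpers instead of on sys.version_info. A hedged sketch of typical usage (the import path is assumed; these lines are not from the deleted tree):

# Hypothetical call sites for the compatibility helpers above.
from wafo.stats.six import b, u, iteritems, with_metaclass  # assumed path

class Meta(type):
    pass

class Base(with_metaclass(Meta)):       # metaclass syntax valid on 2 and 3
    pass

for key, value in iteritems({"a": 1}):  # dict iteration without 2/3 branches
    print(key, value)

raw = b("abc")    # bytes on both majors
text = u("abc")   # text on both majors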
File diff suppressed because it is too large
@@ -1,238 +0,0 @@
from __future__ import division, print_function, absolute_import

import numpy as np
from numpy.testing import assert_array_almost_equal, run_module_suite
from scipy.stats import \
    binned_statistic, binned_statistic_2d, binned_statistic_dd


class TestBinnedStatistic(object):

    @classmethod
    def setup_class(cls):
        np.random.seed(9865)
        cls.x = np.random.random(100)
        cls.y = np.random.random(100)
        cls.v = np.random.random(100)
        cls.X = np.random.random((100, 3))

    def test_1d_count(self):
        x = self.x
        v = self.v

        count1, edges1, bc = binned_statistic(x, v, 'count', bins=10)
        count2, edges2 = np.histogram(x, bins=10)

        assert_array_almost_equal(count1, count2)
        assert_array_almost_equal(edges1, edges2)

    def test_1d_sum(self):
        x = self.x
        v = self.v

        sum1, edges1, bc = binned_statistic(x, v, 'sum', bins=10)
        sum2, edges2 = np.histogram(x, bins=10, weights=v)

        assert_array_almost_equal(sum1, sum2)
        assert_array_almost_equal(edges1, edges2)

    def test_1d_mean(self):
        x = self.x
        v = self.v

        stat1, edges1, bc = binned_statistic(x, v, 'mean', bins=10)
        stat2, edges2, bc = binned_statistic(x, v, np.mean, bins=10)

        assert_array_almost_equal(stat1, stat2)
        assert_array_almost_equal(edges1, edges2)

    def test_1d_std(self):
        x = self.x
        v = self.v

        stat1, edges1, bc = binned_statistic(x, v, 'std', bins=10)
        stat2, edges2, bc = binned_statistic(x, v, np.std, bins=10)

        assert_array_almost_equal(stat1, stat2)
        assert_array_almost_equal(edges1, edges2)

    def test_1d_median(self):
        x = self.x
        v = self.v

        stat1, edges1, bc = binned_statistic(x, v, 'median', bins=10)
        stat2, edges2, bc = binned_statistic(x, v, np.median, bins=10)

        assert_array_almost_equal(stat1, stat2)
        assert_array_almost_equal(edges1, edges2)

    def test_1d_bincode(self):
        x = self.x[:20]
        v = self.v[:20]

        count1, edges1, bc = binned_statistic(x, v, 'count', bins=3)
        bc2 = np.array([3, 2, 1, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 1, 2, 3, 1,
                        1, 2, 1])

        bcount = [(bc == i).sum() for i in np.unique(bc)]

        assert_array_almost_equal(bc, bc2)
        assert_array_almost_equal(bcount, count1)

    def test_1d_range_keyword(self):
        # Regression test for gh-3063, range can be (min, max) or [(min, max)]
        np.random.seed(9865)
        x = np.arange(30)
        data = np.random.random(30)

        mean, bins, _ = binned_statistic(x[:15], data[:15])
        mean_range, bins_range, _ = binned_statistic(x, data, range=[(0, 14)])
        mean_range2, bins_range2, _ = binned_statistic(x, data, range=(0, 14))

        assert_array_almost_equal(mean, mean_range)
        assert_array_almost_equal(bins, bins_range)
        assert_array_almost_equal(mean, mean_range2)
        assert_array_almost_equal(bins, bins_range2)

    def test_2d_count(self):
        x = self.x
        y = self.y
        v = self.v

        count1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'count', bins=5)
        count2, binx2, biny2 = np.histogram2d(x, y, bins=5)

        assert_array_almost_equal(count1, count2)
        assert_array_almost_equal(binx1, binx2)
        assert_array_almost_equal(biny1, biny2)

    def test_2d_sum(self):
        x = self.x
        y = self.y
        v = self.v

        sum1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'sum', bins=5)
        sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v)

        assert_array_almost_equal(sum1, sum2)
        assert_array_almost_equal(binx1, binx2)
        assert_array_almost_equal(biny1, biny2)

    def test_2d_mean(self):
        x = self.x
        y = self.y
        v = self.v

        stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'mean', bins=5)
        stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)

        assert_array_almost_equal(stat1, stat2)
        assert_array_almost_equal(binx1, binx2)
        assert_array_almost_equal(biny1, biny2)

    def test_2d_std(self):
        x = self.x
        y = self.y
        v = self.v

        stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'std', bins=5)
        stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.std, bins=5)

        assert_array_almost_equal(stat1, stat2)
        assert_array_almost_equal(binx1, binx2)
        assert_array_almost_equal(biny1, biny2)

    def test_2d_median(self):
        x = self.x
        y = self.y
        v = self.v

        stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'median', bins=5)
        stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.median, bins=5)

        assert_array_almost_equal(stat1, stat2)
        assert_array_almost_equal(binx1, binx2)
        assert_array_almost_equal(biny1, biny2)

    def test_2d_bincode(self):
        x = self.x[:20]
        y = self.y[:20]
        v = self.v[:20]

        count1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'count', bins=3)
        bc2 = np.array([17, 11, 6, 16, 11, 17, 18, 17, 17, 7, 6, 18, 16,
                        6, 11, 16, 6, 6, 11, 8])

        bcount = [(bc == i).sum() for i in np.unique(bc)]

        assert_array_almost_equal(bc, bc2)
        count1adj = count1[count1.nonzero()]
        assert_array_almost_equal(bcount, count1adj)

    def test_dd_count(self):
        X = self.X
        v = self.v

        count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
        count2, edges2 = np.histogramdd(X, bins=3)

        assert_array_almost_equal(count1, count2)
        assert_array_almost_equal(edges1, edges2)

    def test_dd_sum(self):
        X = self.X
        v = self.v

        sum1, edges1, bc = binned_statistic_dd(X, v, 'sum', bins=3)
        sum2, edges2 = np.histogramdd(X, bins=3, weights=v)

        assert_array_almost_equal(sum1, sum2)
        assert_array_almost_equal(edges1, edges2)

    def test_dd_mean(self):
        X = self.X
        v = self.v

        stat1, edges1, bc = binned_statistic_dd(X, v, 'mean', bins=3)
        stat2, edges2, bc = binned_statistic_dd(X, v, np.mean, bins=3)

        assert_array_almost_equal(stat1, stat2)
        assert_array_almost_equal(edges1, edges2)

    def test_dd_std(self):
        X = self.X
        v = self.v

        stat1, edges1, bc = binned_statistic_dd(X, v, 'std', bins=3)
        stat2, edges2, bc = binned_statistic_dd(X, v, np.std, bins=3)

        assert_array_almost_equal(stat1, stat2)
        assert_array_almost_equal(edges1, edges2)

    def test_dd_median(self):
        X = self.X
        v = self.v

        stat1, edges1, bc = binned_statistic_dd(X, v, 'median', bins=3)
        stat2, edges2, bc = binned_statistic_dd(X, v, np.median, bins=3)

        assert_array_almost_equal(stat1, stat2)
        assert_array_almost_equal(edges1, edges2)

    def test_dd_bincode(self):
        X = self.X[:20]
        v = self.v[:20]

        count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
        bc2 = np.array([63, 33, 86, 83, 88, 67, 57, 33, 42, 41, 82, 83, 92,
                        32, 36, 91, 43, 87, 81, 81])

        bcount = [(bc == i).sum() for i in np.unique(bc)]

        assert_array_almost_equal(bc, bc2)
        count1adj = count1[count1.nonzero()]
        assert_array_almost_equal(bcount, count1adj)


if __name__ == "__main__":
    run_module_suite()
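The equivalences these tests assert can be shown standalone; a short sketch against scipy's public API (data values are illustrative):

import numpy as np
from scipy.stats import binned_statistic

rng = np.random.RandomState(9865)
x = rng.random_sample(100)
v = rng.random_sample(100)

# 'count' reproduces an unweighted histogram ...
counts, edges, binnumber = binned_statistic(x, v, statistic='count', bins=10)
assert np.allclose(counts, np.histogram(x, bins=10)[0])

# ... and 'sum' reproduces a weighted one.
sums, _, _ = binned_statistic(x, v, statistic='sum', bins=10)
assert np.allclose(sums, np.histogram(x, bins=10, weights=v)[0])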
@@ -1,202 +0,0 @@
from __future__ import division, print_function, absolute_import

import numpy as np
from numpy.testing import (run_module_suite, assert_equal, assert_array_equal,
                           assert_array_almost_equal, assert_approx_equal,
                           assert_raises, assert_allclose)
from scipy.special import xlogy
from scipy.stats.contingency import margins, expected_freq, chi2_contingency


def test_margins():
    a = np.array([1])
    m = margins(a)
    assert_equal(len(m), 1)
    m0 = m[0]
    assert_array_equal(m0, np.array([1]))

    a = np.array([[1]])
    m0, m1 = margins(a)
    expected0 = np.array([[1]])
    expected1 = np.array([[1]])
    assert_array_equal(m0, expected0)
    assert_array_equal(m1, expected1)

    a = np.arange(12).reshape(2, 6)
    m0, m1 = margins(a)
    expected0 = np.array([[15], [51]])
    expected1 = np.array([[6, 8, 10, 12, 14, 16]])
    assert_array_equal(m0, expected0)
    assert_array_equal(m1, expected1)

    a = np.arange(24).reshape(2, 3, 4)
    m0, m1, m2 = margins(a)
    expected0 = np.array([[[66]], [[210]]])
    expected1 = np.array([[[60], [92], [124]]])
    expected2 = np.array([[[60, 66, 72, 78]]])
    assert_array_equal(m0, expected0)
    assert_array_equal(m1, expected1)
    assert_array_equal(m2, expected2)


def test_expected_freq():
    assert_array_equal(expected_freq([1]), np.array([1.0]))

    observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
    e = expected_freq(observed)
    assert_array_equal(e, np.ones_like(observed))

    observed = np.array([[10, 10, 20], [20, 20, 20]])
    e = expected_freq(observed)
    correct = np.array([[12., 12., 16.], [18., 18., 24.]])
    assert_array_almost_equal(e, correct)


def test_chi2_contingency_trivial():
    # Some very simple tests for chi2_contingency.

    # A trivial case
    obs = np.array([[1, 2], [1, 2]])
    chi2, p, dof, expected = chi2_contingency(obs, correction=False)
    assert_equal(chi2, 0.0)
    assert_equal(p, 1.0)
    assert_equal(dof, 1)
    assert_array_equal(obs, expected)

    # A *really* trivial case: 1-D data.
    obs = np.array([1, 2, 3])
    chi2, p, dof, expected = chi2_contingency(obs, correction=False)
    assert_equal(chi2, 0.0)
    assert_equal(p, 1.0)
    assert_equal(dof, 0)
    assert_array_equal(obs, expected)


def test_chi2_contingency_R():
    # Some test cases that were computed independently, using R.

    Rcode = \
    """
    # Data vector.
    data <- c(
      12, 34, 23, 4, 47, 11,
      35, 31, 11, 34, 10, 18,
      12, 32, 9, 18, 13, 19,
      12, 12, 14, 9, 33, 25
      )

    # Create factor tags:r=rows, c=columns, t=tiers
    r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
    c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3")))
    t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2")))

    # 3-way Chi squared test of independence
    s = summary(xtabs(data~r+c+t))
    print(s)
    """
    Routput = \
    """
    Call: xtabs(formula = data ~ r + c + t)
    Number of cases in table: 478
    Number of factors: 3
    Test for independence of all factors:
        Chisq = 102.17, df = 17, p-value = 3.514e-14
    """
    obs = np.array(
        [[[12, 34, 23],
          [35, 31, 11],
          [12, 32, 9],
          [12, 12, 14]],
         [[4, 47, 11],
          [34, 10, 18],
          [18, 13, 19],
          [9, 33, 25]]])
    chi2, p, dof, expected = chi2_contingency(obs)
    assert_approx_equal(chi2, 102.17, significant=5)
    assert_approx_equal(p, 3.514e-14, significant=4)
    assert_equal(dof, 17)

    Rcode = \
    """
    # Data vector.
    data <- c(
        #
        12, 17,
        11, 16,
        #
        11, 12,
        15, 16,
        #
        23, 15,
        30, 22,
        #
        14, 17,
        15, 16
        )

    # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
    r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
    c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
    d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
    t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))

    # 4-way Chi squared test of independence
    s = summary(xtabs(data~r+c+d+t))
    print(s)
    """
    Routput = \
    """
    Call: xtabs(formula = data ~ r + c + d + t)
    Number of cases in table: 262
    Number of factors: 4
    Test for independence of all factors:
        Chisq = 8.758, df = 11, p-value = 0.6442
    """
    obs = np.array(
        [[[[12, 17],
           [11, 16]],
          [[11, 12],
           [15, 16]]],
         [[[23, 15],
           [30, 22]],
          [[14, 17],
           [15, 16]]]])
    chi2, p, dof, expected = chi2_contingency(obs)
    assert_approx_equal(chi2, 8.758, significant=4)
    assert_approx_equal(p, 0.6442, significant=4)
    assert_equal(dof, 11)


def test_chi2_contingency_g():
    c = np.array([[15, 60], [15, 90]])
    g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=False)
    assert_allclose(g, 2*xlogy(c, c/e).sum())

    g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=True)
    c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]])
    assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum())

    c = np.array([[10, 12, 10], [12, 10, 10]])
    g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood')
    assert_allclose(g, 2*xlogy(c, c/e).sum())


def test_chi2_contingency_bad_args():
    # Test that "bad" inputs raise a ValueError.

    # Negative value in the array of observed frequencies.
    obs = np.array([[-1, 10], [1, 2]])
    assert_raises(ValueError, chi2_contingency, obs)

    # The zeros in this will result in zeros in the array
    # of expected frequencies.
    obs = np.array([[0, 1], [0, 1]])
    assert_raises(ValueError, chi2_contingency, obs)

    # A degenerate case: `observed` has size 0.
    obs = np.empty((0, 8))
    assert_raises(ValueError, chi2_contingency, obs)


if __name__ == "__main__":
    run_module_suite()
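The trivial case above doubles as a usage example and can be run directly:

import numpy as np
from scipy.stats.contingency import chi2_contingency

obs = np.array([[1, 2], [1, 2]])    # rows are proportional: exact independence
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
print(chi2, p, dof)                 # -> 0.0 1.0 1; `expected` equals `obs`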
File diff suppressed because it is too large
@@ -1,202 +0,0 @@
from __future__ import division, print_function, absolute_import

from wafo import stats
import numpy as np
from numpy.testing import assert_almost_equal, assert_, assert_raises, \
    assert_array_almost_equal, assert_array_almost_equal_nulp, run_module_suite


def test_kde_1d():
    # some basic tests comparing to normal distribution
    np.random.seed(8765678)
    n_basesample = 500
    xn = np.random.randn(n_basesample)
    xnmean = xn.mean()
    xnstd = xn.std(ddof=1)

    # get kde for original sample
    gkde = stats.gaussian_kde(xn)

    # evaluate the density function for the kde for some points
    xs = np.linspace(-7, 7, 501)
    kdepdf = gkde.evaluate(xs)
    normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd)
    intervall = xs[1] - xs[0]

    assert_(np.sum((kdepdf - normpdf)**2)*intervall < 0.01)
    prob1 = gkde.integrate_box_1d(xnmean, np.inf)
    prob2 = gkde.integrate_box_1d(-np.inf, xnmean)
    assert_almost_equal(prob1, 0.5, decimal=1)
    assert_almost_equal(prob2, 0.5, decimal=1)
    assert_almost_equal(gkde.integrate_box(xnmean, np.inf), prob1, decimal=13)
    assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), prob2, decimal=13)

    assert_almost_equal(gkde.integrate_kde(gkde),
                        (kdepdf**2).sum()*intervall, decimal=2)
    assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2),
                        (kdepdf*normpdf).sum()*intervall, decimal=2)


def test_kde_bandwidth_method():
    def scotts_factor(kde_obj):
        """Same as default, just check that it works."""
        return np.power(kde_obj.n, -1./(kde_obj.d+4))

    np.random.seed(8765678)
    n_basesample = 50
    xn = np.random.randn(n_basesample)

    # Default
    gkde = stats.gaussian_kde(xn)
    # Supply a callable
    gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor)
    # Supply a scalar
    gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor)

    xs = np.linspace(-7, 7, 51)
    kdepdf = gkde.evaluate(xs)
    kdepdf2 = gkde2.evaluate(xs)
    assert_almost_equal(kdepdf, kdepdf2)
    kdepdf3 = gkde3.evaluate(xs)
    assert_almost_equal(kdepdf, kdepdf3)

    assert_raises(ValueError, stats.gaussian_kde, xn, bw_method='wrongstring')


# Subclasses that should stay working (extracted from various sources).
# Unfortunately the earlier design of gaussian_kde made it necessary for users
# to create these kinds of subclasses, or call _compute_covariance() directly.

class _kde_subclass1(stats.gaussian_kde):
    def __init__(self, dataset):
        self.dataset = np.atleast_2d(dataset)
        self.d, self.n = self.dataset.shape
        self.covariance_factor = self.scotts_factor
        self._compute_covariance()


class _kde_subclass2(stats.gaussian_kde):
    def __init__(self, dataset):
        self.covariance_factor = self.scotts_factor
        super(_kde_subclass2, self).__init__(dataset)


class _kde_subclass3(stats.gaussian_kde):
    def __init__(self, dataset, covariance):
        self.covariance = covariance
        stats.gaussian_kde.__init__(self, dataset)

    def _compute_covariance(self):
        self.inv_cov = np.linalg.inv(self.covariance)
        self._norm_factor = np.sqrt(np.linalg.det(2*np.pi * self.covariance)) \
            * self.n


class _kde_subclass4(stats.gaussian_kde):
    def covariance_factor(self):
        return 0.5 * self.silverman_factor()


def test_gaussian_kde_subclassing():
    x1 = np.array([-7, -5, 1, 4, 5], dtype=np.float)
    xs = np.linspace(-10, 10, num=50)

    # gaussian_kde itself
    kde = stats.gaussian_kde(x1)
    ys = kde(xs)

    # subclass 1
    kde1 = _kde_subclass1(x1)
    y1 = kde1(xs)
    assert_array_almost_equal_nulp(ys, y1, nulp=10)

    # subclass 2
    kde2 = _kde_subclass2(x1)
    y2 = kde2(xs)
    assert_array_almost_equal_nulp(ys, y2, nulp=10)

    # subclass 3
    kde3 = _kde_subclass3(x1, kde.covariance)
    y3 = kde3(xs)
    assert_array_almost_equal_nulp(ys, y3, nulp=10)

    # subclass 4
    kde4 = _kde_subclass4(x1)
    y4 = kde4(x1)
    y_expected = [0.06292987, 0.06346938, 0.05860291, 0.08657652, 0.07904017]

    assert_array_almost_equal(y_expected, y4, decimal=6)

    # Not a subclass, but check for use of _compute_covariance()
    kde5 = kde
    kde5.covariance_factor = lambda: kde.factor
    kde5._compute_covariance()
    y5 = kde5(xs)
    assert_array_almost_equal_nulp(ys, y5, nulp=10)


def test_gaussian_kde_covariance_caching():
    x1 = np.array([-7, -5, 1, 4, 5], dtype=np.float)
    xs = np.linspace(-10, 10, num=5)
    # These expected values are from scipy 0.10, before some changes to
    # gaussian_kde. They were not compared with any external reference.
    y_expected = [0.02463386, 0.04689208, 0.05395444, 0.05337754, 0.01664475]

    # Set the bandwidth, then reset it to the default.
    kde = stats.gaussian_kde(x1)
    kde.set_bandwidth(bw_method=0.5)
    kde.set_bandwidth(bw_method='scott')
    y2 = kde(xs)

    assert_array_almost_equal(y_expected, y2, decimal=7)


def test_gaussian_kde_monkeypatch():
    """Ugly, but people may rely on this. See scipy pull request 123,
    specifically the linked ML thread "Width of the Gaussian in stats.kde".
    If it is necessary to break this later on, that is to be discussed on ML.
    """
    x1 = np.array([-7, -5, 1, 4, 5], dtype=np.float)
    xs = np.linspace(-10, 10, num=50)

    # The old monkeypatched version to get at Silverman's Rule.
    kde = stats.gaussian_kde(x1)
    kde.covariance_factor = kde.silverman_factor
    kde._compute_covariance()
    y1 = kde(xs)

    # The new saner version.
    kde2 = stats.gaussian_kde(x1, bw_method='silverman')
    y2 = kde2(xs)

    assert_array_almost_equal_nulp(y1, y2, nulp=10)


def test_kde_integer_input():
    """Regression test for #1181."""
    x1 = np.arange(5)
    kde = stats.gaussian_kde(x1)
    y_expected = [0.13480721, 0.18222869, 0.19514935, 0.18222869, 0.13480721]
    assert_array_almost_equal(kde(x1), y_expected, decimal=6)


def test_pdf_logpdf():
    np.random.seed(1)
    n_basesample = 50
    xn = np.random.randn(n_basesample)

    # Default
    gkde = stats.gaussian_kde(xn)

    xs = np.linspace(-15, 12, 25)
    pdf = gkde.evaluate(xs)
    pdf2 = gkde.pdf(xs)
    assert_almost_equal(pdf, pdf2, decimal=12)

    logpdf = np.log(pdf)
    logpdf2 = gkde.logpdf(xs)
    assert_almost_equal(logpdf, logpdf2, decimal=12)


if __name__ == "__main__":
    run_module_suite()
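The bandwidth-selection behaviour exercised above, in one standalone sketch against scipy's public API (illustrative data):

import numpy as np
from scipy import stats

xn = np.random.RandomState(8765678).randn(50)

kde_default = stats.gaussian_kde(xn)                   # Scott's rule
kde_scalar = stats.gaussian_kde(xn, bw_method=kde_default.factor)
kde_silver = stats.gaussian_kde(xn, bw_method='silverman')

xs = np.linspace(-4, 4, 9)
# The default rule and the same factor passed back as a scalar agree exactly.
assert np.allclose(kde_default(xs), kde_scalar(xs))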
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,107 +0,0 @@
# pylint: disable-msg=W0611, W0612, W0511,R0201
"""Tests suite for maskedArray statistics.

:author: Pierre Gerard-Marchant
:contact: pierregm_at_uga_dot_edu
"""
from __future__ import division, print_function, absolute_import

__author__ = "Pierre GF Gerard-Marchant ($Author: backtopop $)"

import numpy as np

import numpy.ma as ma

import wafo.stats.mstats as ms
#import wafo.stats.mmorestats as mms

from numpy.testing import TestCase, run_module_suite, assert_equal, \
    assert_almost_equal, assert_


class TestMisc(TestCase):

    def __init__(self, *args, **kwargs):
        TestCase.__init__(self, *args, **kwargs)

    def test_mjci(self):
        "Tests the Maritz-Jarrett estimator"
        data = ma.array([77, 87, 88, 114, 151, 210, 219, 246, 253, 262,
                         296, 299, 306, 376, 428, 515, 666, 1310, 2611])
        assert_almost_equal(ms.mjci(data), [55.76819, 45.84028, 198.87875], 5)

    def test_trimmedmeanci(self):
        "Tests the confidence intervals of the trimmed mean."
        data = ma.array([545, 555, 558, 572, 575, 576, 578, 580,
                         594, 605, 635, 651, 653, 661, 666])
        assert_almost_equal(ms.trimmed_mean(data, 0.2), 596.2, 1)
        assert_equal(np.round(ms.trimmed_mean_ci(data, (0.2, 0.2)), 1),
                     [561.8, 630.6])

    def test_idealfourths(self):
        "Tests ideal-fourths"
        test = np.arange(100)
        assert_almost_equal(np.asarray(ms.idealfourths(test)),
                            [24.416667, 74.583333], 6)
        test_2D = test.repeat(3).reshape(-1, 3)
        assert_almost_equal(ms.idealfourths(test_2D, axis=0),
                            [[24.416667, 24.416667, 24.416667],
                             [74.583333, 74.583333, 74.583333]], 6)
        assert_almost_equal(ms.idealfourths(test_2D, axis=1),
                            test.repeat(2).reshape(-1, 2))
        test = [0, 0]
        _result = ms.idealfourths(test)
        assert_(np.isnan(_result).all())

#..............................................................................


class TestQuantiles(TestCase):

    def __init__(self, *args, **kwargs):
        TestCase.__init__(self, *args, **kwargs)

    def test_hdquantiles(self):
        data = [0.706560797,0.727229578,0.990399276,0.927065621,0.158953014,
                0.887764025,0.239407086,0.349638551,0.972791145,0.149789972,
                0.936947700,0.132359948,0.046041972,0.641675031,0.945530547,
                0.224218684,0.771450991,0.820257774,0.336458052,0.589113496,
                0.509736129,0.696838829,0.491323573,0.622767425,0.775189248,
                0.641461450,0.118455200,0.773029450,0.319280007,0.752229111,
                0.047841438,0.466295911,0.583850781,0.840581845,0.550086491,
                0.466470062,0.504765074,0.226855960,0.362641207,0.891620942,
                0.127898691,0.490094097,0.044882048,0.041441695,0.317976349,
                0.504135618,0.567353033,0.434617473,0.636243375,0.231803616,
                0.230154113,0.160011327,0.819464108,0.854706985,0.438809221,
                0.487427267,0.786907310,0.408367937,0.405534192,0.250444460,
                0.995309248,0.144389588,0.739947527,0.953543606,0.680051621,
                0.388382017,0.863530727,0.006514031,0.118007779,0.924024803,
                0.384236354,0.893687694,0.626534881,0.473051932,0.750134705,
                0.241843555,0.432947602,0.689538104,0.136934797,0.150206859,
                0.474335206,0.907775349,0.525869295,0.189184225,0.854284286,
                0.831089744,0.251637345,0.587038213,0.254475554,0.237781276,
                0.827928620,0.480283781,0.594514455,0.213641488,0.024194386,
                0.536668589,0.699497811,0.892804071,0.093835427,0.731107772]
        #
        assert_almost_equal(ms.hdquantiles(data, [0., 1.]),
                            [0.006514031, 0.995309248])
        hdq = ms.hdquantiles(data, [0.25, 0.5, 0.75])
        assert_almost_equal(hdq, [0.253210762, 0.512847491, 0.762232442, ])
        hdq = ms.hdquantiles_sd(data, [0.25, 0.5, 0.75])
        assert_almost_equal(hdq, [0.03786954, 0.03805389, 0.03800152, ], 4)
        #
        data = np.array(data).reshape(10, 10)
        hdq = ms.hdquantiles(data, [0.25, 0.5, 0.75], axis=0)
        assert_almost_equal(hdq[:, 0], ms.hdquantiles(data[:, 0], [0.25, 0.5, 0.75]))
        assert_almost_equal(hdq[:, -1], ms.hdquantiles(data[:, -1], [0.25, 0.5, 0.75]))
        hdq = ms.hdquantiles(data, [0.25, 0.5, 0.75], axis=0, var=True)
        assert_almost_equal(hdq[..., 0],
                            ms.hdquantiles(data[:, 0], [0.25, 0.5, 0.75], var=True))
        assert_almost_equal(hdq[..., -1],
                            ms.hdquantiles(data[:, -1], [0.25, 0.5, 0.75], var=True))


###############################################################################

if __name__ == "__main__":
    run_module_suite()
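For orientation, the scipy counterparts of the masked-statistics routines tested above can be called directly; a small sketch (data values are illustrative):

import numpy as np
import scipy.stats.mstats as ms

data = np.arange(100, dtype=float)
print(ms.hdquantiles(data, [0.25, 0.5, 0.75]))   # Harrell-Davis quantiles
print(ms.trimmed_mean(data, (0.2, 0.2)))         # 20% trimmed mean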
@ -1,485 +0,0 @@
|
|||||||
"""
|
|
||||||
Test functions for multivariate normal distributions.
|
|
||||||
|
|
||||||
"""
|
|
||||||
from __future__ import division, print_function, absolute_import
|
|
||||||
|
|
||||||
from numpy.testing import (
|
|
||||||
assert_allclose,
|
|
||||||
assert_almost_equal,
|
|
||||||
assert_array_almost_equal,
|
|
||||||
assert_equal,
|
|
||||||
assert_raises,
|
|
||||||
run_module_suite,
|
|
||||||
)
|
|
||||||
|
|
||||||
import numpy
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
import scipy.linalg
|
|
||||||
from wafo.stats._multivariate import _PSD, _lnB
|
|
||||||
from wafo.stats import multivariate_normal
|
|
||||||
from wafo.stats import dirichlet, beta
|
|
||||||
from wafo.stats import norm
|
|
||||||
|
|
||||||
from scipy.integrate import romb
|
|
||||||
|
|
||||||
|
|
||||||
def test_input_shape():
|
|
||||||
mu = np.arange(3)
|
|
||||||
cov = np.identity(2)
|
|
||||||
assert_raises(ValueError, multivariate_normal.pdf, (0, 1), mu, cov)
|
|
||||||
assert_raises(ValueError, multivariate_normal.pdf, (0, 1, 2), mu, cov)
|
|
||||||
|
|
||||||
|
|
||||||
def test_scalar_values():
|
|
||||||
np.random.seed(1234)
|
|
||||||
|
|
||||||
# When evaluated on scalar data, the pdf should return a scalar
|
|
||||||
x, mean, cov = 1.5, 1.7, 2.5
|
|
||||||
pdf = multivariate_normal.pdf(x, mean, cov)
|
|
||||||
assert_equal(pdf.ndim, 0)
|
|
||||||
|
|
||||||
# When evaluated on a single vector, the pdf should return a scalar
|
|
||||||
x = np.random.randn(5)
|
|
||||||
mean = np.random.randn(5)
|
|
||||||
cov = np.abs(np.random.randn(5)) # Diagonal values for cov. matrix
|
|
||||||
pdf = multivariate_normal.pdf(x, mean, cov)
|
|
||||||
assert_equal(pdf.ndim, 0)
|
|
||||||
|
|
||||||
|
|
||||||
def test_logpdf():
|
|
||||||
# Check that the log of the pdf is in fact the logpdf
|
|
||||||
np.random.seed(1234)
|
|
||||||
x = np.random.randn(5)
|
|
||||||
mean = np.random.randn(5)
|
|
||||||
cov = np.abs(np.random.randn(5))
|
|
||||||
d1 = multivariate_normal.logpdf(x, mean, cov)
|
|
||||||
d2 = multivariate_normal.pdf(x, mean, cov)
|
|
||||||
assert_allclose(d1, np.log(d2))
|
|
||||||
|
|
||||||
|
|
||||||
def test_rank():
|
|
||||||
# Check that the rank is detected correctly.
|
|
||||||
np.random.seed(1234)
|
|
||||||
n = 4
|
|
||||||
mean = np.random.randn(n)
|
|
||||||
for expected_rank in range(1, n + 1):
|
|
||||||
s = np.random.randn(n, expected_rank)
|
|
||||||
cov = np.dot(s, s.T)
|
|
||||||
distn = multivariate_normal(mean, cov, allow_singular=True)
|
|
||||||
assert_equal(distn.cov_info.rank, expected_rank)
|
|
||||||
|
|
||||||
|
|
||||||
def _sample_orthonormal_matrix(n):
|
|
||||||
M = np.random.randn(n, n)
|
|
||||||
u, s, v = scipy.linalg.svd(M)
|
|
||||||
return u
|
|
||||||
|
|
||||||
|
|
||||||
def test_degenerate_distributions():
|
|
||||||
for n in range(1, 5):
|
|
||||||
x = np.random.randn(n)
|
|
||||||
for k in range(1, n + 1):
|
|
||||||
# Sample a small covariance matrix.
|
|
||||||
s = np.random.randn(k, k)
|
|
||||||
cov_kk = np.dot(s, s.T)
|
|
||||||
|
|
||||||
# Embed the small covariance matrix into a larger low rank matrix.
|
|
||||||
cov_nn = np.zeros((n, n))
|
|
||||||
cov_nn[:k, :k] = cov_kk
|
|
||||||
|
|
||||||
# Define a rotation of the larger low rank matrix.
|
|
||||||
u = _sample_orthonormal_matrix(n)
|
|
||||||
cov_rr = np.dot(u, np.dot(cov_nn, u.T))
|
|
||||||
y = np.dot(u, x)
|
|
||||||
|
|
||||||
# Check some identities.
|
|
||||||
distn_kk = multivariate_normal(np.zeros(k), cov_kk,
|
|
||||||
allow_singular=True)
|
|
||||||
distn_nn = multivariate_normal(np.zeros(n), cov_nn,
|
|
||||||
allow_singular=True)
|
|
||||||
distn_rr = multivariate_normal(np.zeros(n), cov_rr,
|
|
||||||
allow_singular=True)
|
|
||||||
assert_equal(distn_kk.cov_info.rank, k)
|
|
||||||
assert_equal(distn_nn.cov_info.rank, k)
|
|
||||||
assert_equal(distn_rr.cov_info.rank, k)
|
|
||||||
pdf_kk = distn_kk.pdf(x[:k])
|
|
||||||
pdf_nn = distn_nn.pdf(x)
|
|
||||||
pdf_rr = distn_rr.pdf(y)
|
|
||||||
assert_allclose(pdf_kk, pdf_nn)
|
|
||||||
assert_allclose(pdf_kk, pdf_rr)
|
|
||||||
logpdf_kk = distn_kk.logpdf(x[:k])
|
|
||||||
logpdf_nn = distn_nn.logpdf(x)
|
|
||||||
logpdf_rr = distn_rr.logpdf(y)
|
|
||||||
assert_allclose(logpdf_kk, logpdf_nn)
|
|
||||||
assert_allclose(logpdf_kk, logpdf_rr)
|
|
||||||
|
|
||||||
|
|
||||||
def test_large_pseudo_determinant():
|
|
||||||
# Check that large pseudo-determinants are handled appropriately.
|
|
||||||
|
|
||||||
# Construct a singular diagonal covariance matrix
|
|
||||||
# whose pseudo determinant overflows double precision.
|
|
||||||
large_total_log = 1000.0
|
|
||||||
npos = 100
|
|
||||||
nzero = 2
|
|
||||||
large_entry = np.exp(large_total_log / npos)
|
|
||||||
n = npos + nzero
|
|
||||||
cov = np.zeros((n, n), dtype=float)
|
|
||||||
np.fill_diagonal(cov, large_entry)
|
|
||||||
cov[-nzero:, -nzero:] = 0
|
|
||||||
|
|
||||||
# Check some determinants.
|
|
||||||
assert_equal(scipy.linalg.det(cov), 0)
|
|
||||||
assert_equal(scipy.linalg.det(cov[:npos, :npos]), np.inf)
|
|
||||||
|
|
||||||
# np.linalg.slogdet is only available in numpy 1.6+
|
|
||||||
# but scipy currently supports numpy 1.5.1.
|
|
||||||
# assert_allclose(np.linalg.slogdet(cov[:npos, :npos]),
|
|
||||||
# (1, large_total_log))
|
|
||||||
|
|
||||||
# Check the pseudo-determinant.
|
|
||||||
psd = _PSD(cov)
|
|
||||||
assert_allclose(psd.log_pdet, large_total_log)
|
|
||||||
|
|
||||||
|
|
||||||
def test_broadcasting():
|
|
||||||
np.random.seed(1234)
|
|
||||||
n = 4
|
|
||||||
|
|
||||||
# Construct a random covariance matrix.
|
|
||||||
data = np.random.randn(n, n)
|
|
||||||
cov = np.dot(data, data.T)
|
|
||||||
mean = np.random.randn(n)
|
|
||||||
|
|
||||||
# Construct an ndarray which can be interpreted as
|
|
||||||
# a 2x3 array whose elements are random data vectors.
|
|
||||||
X = np.random.randn(2, 3, n)
|
|
||||||
|
|
||||||
# Check that multiple data points can be evaluated at once.
|
|
||||||
for i in range(2):
|
|
||||||
for j in range(3):
|
|
||||||
actual = multivariate_normal.pdf(X[i, j], mean, cov)
|
|
||||||
desired = multivariate_normal.pdf(X, mean, cov)[i, j]
|
|
||||||
assert_allclose(actual, desired)
|
|
||||||
|
|
||||||
|
|
||||||
def test_normal_1D():
|
|
||||||
# The probability density function for a 1D normal variable should
|
|
||||||
# agree with the standard normal distribution in scipy.stats.distributions
|
|
||||||
x = np.linspace(0, 2, 10)
|
|
||||||
mean, cov = 1.2, 0.9
|
|
||||||
scale = cov**0.5
|
|
||||||
d1 = norm.pdf(x, mean, scale)
|
|
||||||
d2 = multivariate_normal.pdf(x, mean, cov)
|
|
||||||
assert_allclose(d1, d2)
|
|
||||||
|
|
||||||
|
|
||||||
def test_marginalization():
|
|
||||||
# Integrating out one of the variables of a 2D Gaussian should
|
|
||||||
# yield a 1D Gaussian
|
|
||||||
mean = np.array([2.5, 3.5])
|
|
||||||
cov = np.array([[.5, 0.2], [0.2, .6]])
|
|
||||||
n = 2 ** 8 + 1 # Number of samples
|
|
||||||
delta = 6 / (n - 1) # Grid spacing
|
|
||||||
|
|
||||||
v = np.linspace(0, 6, n)
|
|
||||||
xv, yv = np.meshgrid(v, v)
|
|
||||||
pos = np.empty((n, n, 2))
|
|
||||||
pos[:, :, 0] = xv
|
|
||||||
pos[:, :, 1] = yv
|
|
||||||
pdf = multivariate_normal.pdf(pos, mean, cov)
|
|
||||||
|
|
||||||
# Marginalize over x and y axis
|
|
||||||
margin_x = romb(pdf, delta, axis=0)
|
|
||||||
margin_y = romb(pdf, delta, axis=1)
|
|
||||||
|
|
||||||
# Compare with standard normal distribution
|
|
||||||
gauss_x = norm.pdf(v, loc=mean[0], scale=cov[0, 0] ** 0.5)
|
|
||||||
gauss_y = norm.pdf(v, loc=mean[1], scale=cov[1, 1] ** 0.5)
|
|
||||||
assert_allclose(margin_x, gauss_x, rtol=1e-2, atol=1e-2)
|
|
||||||
assert_allclose(margin_y, gauss_y, rtol=1e-2, atol=1e-2)
|
|
||||||
|
|
||||||
|
|
||||||
def test_frozen():
|
|
||||||
# The frozen distribution should agree with the regular one
|
|
||||||
np.random.seed(1234)
|
|
||||||
x = np.random.randn(5)
|
|
||||||
mean = np.random.randn(5)
|
|
||||||
cov = np.abs(np.random.randn(5))
|
|
||||||
norm_frozen = multivariate_normal(mean, cov)
|
|
||||||
assert_allclose(norm_frozen.pdf(x), multivariate_normal.pdf(x, mean, cov))
|
|
||||||
assert_allclose(norm_frozen.logpdf(x),
|
|
||||||
multivariate_normal.logpdf(x, mean, cov))
|
|
||||||
|
|
||||||
|
|
||||||
def test_pseudodet_pinv():
|
|
||||||
# Make sure that pseudo-inverse and pseudo-det agree on cutoff
|
|
||||||
|
|
||||||
# Assemble random covariance matrix with large and small eigenvalues
|
|
||||||
np.random.seed(1234)
|
|
||||||
n = 7
|
|
||||||
x = np.random.randn(n, n)
|
|
||||||
cov = np.dot(x, x.T)
|
|
||||||
s, u = scipy.linalg.eigh(cov)
|
|
||||||
s = 0.5 * np.ones(n)
|
|
||||||
s[0] = 1.0
|
|
||||||
s[-1] = 1e-7
|
|
||||||
cov = np.dot(u, np.dot(np.diag(s), u.T))
|
|
||||||
|
|
||||||
# Set cond so that the lowest eigenvalue is below the cutoff
|
|
||||||
cond = 1e-5
|
|
||||||
psd = _PSD(cov, cond=cond)
|
|
||||||
psd_pinv = _PSD(psd.pinv, cond=cond)
|
|
||||||
|
|
||||||
# Check that the log pseudo-determinant agrees with the sum
|
|
||||||
# of the logs of all but the smallest eigenvalue
|
|
||||||
assert_allclose(psd.log_pdet, np.sum(np.log(s[:-1])))
|
|
||||||
# Check that the pseudo-determinant of the pseudo-inverse
|
|
||||||
# agrees with 1 / pseudo-determinant
|
|
||||||
assert_allclose(-psd.log_pdet, psd_pinv.log_pdet)
|
|
||||||
|
|
||||||
|
|
||||||
def test_exception_nonsquare_cov():
|
|
||||||
cov = [[1, 2, 3], [4, 5, 6]]
|
|
||||||
assert_raises(ValueError, _PSD, cov)
|
|
||||||
|
|
||||||
|
|
||||||
def test_exception_nonfinite_cov():
|
|
||||||
cov_nan = [[1, 0], [0, np.nan]]
|
|
||||||
assert_raises(ValueError, _PSD, cov_nan)
|
|
||||||
cov_inf = [[1, 0], [0, np.inf]]
|
|
||||||
assert_raises(ValueError, _PSD, cov_inf)
|
|
||||||
|
|
||||||
|
|
||||||
def test_exception_non_psd_cov():
|
|
||||||
cov = [[1, 0], [0, -1]]
|
|
||||||
assert_raises(ValueError, _PSD, cov)
|
|
||||||
|
|
||||||
|
|
||||||
def test_exception_singular_cov():
|
|
||||||
np.random.seed(1234)
|
|
||||||
x = np.random.randn(5)
|
|
||||||
mean = np.random.randn(5)
|
|
||||||
cov = np.ones((5, 5))
|
|
||||||
e = np.linalg.LinAlgError
|
|
||||||
assert_raises(e, multivariate_normal, mean, cov)
|
|
||||||
assert_raises(e, multivariate_normal.pdf, x, mean, cov)
|
|
||||||
assert_raises(e, multivariate_normal.logpdf, x, mean, cov)
|
|
||||||
|
|
||||||
|
|
||||||
def test_R_values():
|
|
||||||
# Compare the multivariate pdf with some values precomputed
|
|
||||||
# in R version 3.0.1 (2013-05-16) on Mac OS X 10.6.
|
|
||||||
|
|
||||||
# The values below were generated by the following R-script:
|
|
||||||
# > library(mnormt)
|
|
||||||
# > x <- seq(0, 2, length=5)
|
|
||||||
# > y <- 3*x - 2
|
|
||||||
# > z <- x + cos(y)
|
|
||||||
# > mu <- c(1, 3, 2)
|
|
||||||
# > Sigma <- matrix(c(1,2,0,2,5,0.5,0,0.5,3), 3, 3)
|
|
||||||
# > r_pdf <- dmnorm(cbind(x,y,z), mu, Sigma)
|
|
||||||
r_pdf = np.array([0.0002214706, 0.0013819953, 0.0049138692,
|
|
||||||
0.0103803050, 0.0140250800])
|
|
||||||
|
|
||||||
x = np.linspace(0, 2, 5)
|
|
||||||
y = 3 * x - 2
|
|
||||||
z = x + np.cos(y)
|
|
||||||
r = np.array([x, y, z]).T
|
|
||||||
|
|
||||||
mean = np.array([1, 3, 2], 'd')
|
|
||||||
cov = np.array([[1, 2, 0], [2, 5, .5], [0, .5, 3]], 'd')
|
|
||||||
|
|
||||||
pdf = multivariate_normal.pdf(r, mean, cov)
|
|
||||||
assert_allclose(pdf, r_pdf, atol=1e-10)
|
|
||||||
|
|
||||||
|
|
||||||
def test_multivariate_normal_rvs_zero_covariance():
|
|
||||||
mean = np.zeros(2)
|
|
||||||
covariance = np.zeros((2, 2))
|
|
||||||
model = multivariate_normal(mean, covariance, allow_singular=True)
|
|
||||||
sample = model.rvs()
|
|
||||||
assert_equal(sample, [0, 0])
|
|
||||||
|
|
||||||
|
|
||||||
def test_rvs_shape():
|
|
||||||
# Check that rvs parses the mean and covariance correctly, and returns
|
|
||||||
# an array of the right shape
|
|
||||||
N = 300
|
|
||||||
d = 4
|
|
||||||
sample = multivariate_normal.rvs(mean=np.zeros(d), cov=1, size=N)
|
|
||||||
assert_equal(sample.shape, (N, d))
|
|
||||||
|
|
||||||
sample = multivariate_normal.rvs(mean=None,
|
|
||||||
cov=np.array([[2, .1], [.1, 1]]),
|
|
||||||
size=N)
|
|
||||||
assert_equal(sample.shape, (N, 2))
|
|
||||||
|
|
||||||
u = multivariate_normal(mean=0, cov=1)
|
|
||||||
sample = u.rvs(N)
|
|
||||||
assert_equal(sample.shape, (N, ))
|
|
||||||
|
|
||||||
|
|
||||||
def test_large_sample():
|
|
||||||
# Generate large sample and compare sample mean and sample covariance
|
|
||||||
# with mean and covariance matrix.
|
|
||||||
|
|
||||||
np.random.seed(2846)
|
|
||||||
|
|
||||||
n = 3
|
|
||||||
mean = np.random.randn(n)
|
|
||||||
M = np.random.randn(n, n)
|
|
||||||
cov = np.dot(M, M.T)
|
|
||||||
size = 5000
|
|
||||||
|
|
||||||
sample = multivariate_normal.rvs(mean, cov, size)
|
|
||||||
|
|
||||||
assert_allclose(numpy.cov(sample.T), cov, rtol=1e-1)
|
|
||||||
assert_allclose(sample.mean(0), mean, rtol=1e-1)
|
|
||||||
|
|
||||||
|
|
||||||
def test_entropy():
|
|
||||||
np.random.seed(2846)
|
|
||||||
|
|
||||||
n = 3
|
|
||||||
mean = np.random.randn(n)
|
|
||||||
M = np.random.randn(n, n)
|
|
||||||
cov = np.dot(M, M.T)
|
|
||||||
|
|
||||||
rv = multivariate_normal(mean, cov)
|
|
||||||
|
|
||||||
# Check that frozen distribution agrees with entropy function
|
|
||||||
assert_almost_equal(rv.entropy(), multivariate_normal.entropy(mean, cov))
|
|
||||||
# Compare entropy with manually computed expression involving
|
|
||||||
# the sum of the logs of the eigenvalues of the covariance matrix
|
|
||||||
eigs = np.linalg.eig(cov)[0]
|
|
||||||
desired = 1 / 2 * (n * (np.log(2 * np.pi) + 1) + np.sum(np.log(eigs)))
|
|
||||||
assert_almost_equal(desired, rv.entropy())
|
|
||||||
|
|
||||||
|
|
||||||
def test_lnB():
|
|
||||||
alpha = np.array([1, 1, 1])
|
|
||||||
desired = .5 # e^lnB = 1/2 for [1, 1, 1]
|
|
||||||
|
|
||||||
assert_almost_equal(np.exp(_lnB(alpha)), desired)
|
|
||||||
|
|
||||||
|
|
||||||
def test_frozen_dirichlet():
|
|
||||||
np.random.seed(2846)
|
|
||||||
|
|
||||||
n = np.random.randint(1, 32)
|
|
||||||
alpha = np.random.uniform(10e-10, 100, n)
|
|
||||||
|
|
||||||
d = dirichlet(alpha)
|
|
||||||
|
|
||||||
assert_equal(d.var(), dirichlet.var(alpha))
|
|
||||||
assert_equal(d.mean(), dirichlet.mean(alpha))
|
|
||||||
assert_equal(d.entropy(), dirichlet.entropy(alpha))
|
|
||||||
num_tests = 10
|
|
||||||
for i in range(num_tests):
|
|
||||||
x = np.random.uniform(10e-10, 100, n)
|
|
||||||
x /= np.sum(x)
|
|
||||||
assert_equal(d.pdf(x[:-1]), dirichlet.pdf(x[:-1], alpha))
|
|
||||||
assert_equal(d.logpdf(x[:-1]), dirichlet.logpdf(x[:-1], alpha))
|
|
||||||
|
|
||||||
|
|
||||||
def test_simple_values():
|
|
||||||
alpha = np.array([1, 1])
|
|
||||||
d = dirichlet(alpha)
|
|
||||||
|
|
||||||
assert_almost_equal(d.mean(), 0.5)
|
|
||||||
assert_almost_equal(d.var(), 1. / 12.)
|
|
||||||
|
|
||||||
b = beta(1, 1)
|
|
||||||
assert_almost_equal(d.mean(), b.mean())
|
|
||||||
assert_almost_equal(d.var(), b.var())
|
|
||||||
|
|
||||||
|
|
||||||
def test_K_and_K_minus_1_calls_equal():
|
|
||||||
# Test that calls with K and K-1 entries yield the same results.
|
|
||||||
|
|
||||||
np.random.seed(2846)
|
|
||||||
|
|
||||||
n = np.random.randint(1, 32)
|
|
||||||
alpha = np.random.uniform(10e-10, 100, n)
|
|
||||||
|
|
||||||
d = dirichlet(alpha)
|
|
||||||
num_tests = 10
|
|
||||||
for i in range(num_tests):
|
|
||||||
x = np.random.uniform(10e-10, 100, n)
|
|
||||||
x /= np.sum(x)
|
|
||||||
assert_almost_equal(d.pdf(x[:-1]), d.pdf(x))
|
|
||||||
|
|
||||||
|
|


def test_multiple_entry_calls():
    # Test that calls with a matrix of multiple x vectors work.

    np.random.seed(2846)

    n = np.random.randint(1, 32)
    alpha = np.random.uniform(10e-10, 100, n)
    d = dirichlet(alpha)

    num_tests = 10
    num_multiple = 5
    xm = None
    for i in range(num_tests):
        for m in range(num_multiple):
            x = np.random.uniform(10e-10, 100, n)
            x /= np.sum(x)
            if xm is not None:
                xm = np.vstack((xm, x))
            else:
                xm = x
        rm = d.pdf(xm.T)
        rs = None
        for xs in xm:
            r = d.pdf(xs)
            if rs is not None:
                rs = np.append(rs, r)
            else:
                rs = r
        assert_array_almost_equal(rm, rs)


def test_2D_dirichlet_is_beta():
    np.random.seed(2846)

    alpha = np.random.uniform(10e-10, 100, 2)
    d = dirichlet(alpha)
    b = beta(alpha[0], alpha[1])

    num_tests = 10
    for i in range(num_tests):
        x = np.random.uniform(10e-10, 100, 2)
        x /= np.sum(x)
        assert_almost_equal(b.pdf(x), d.pdf([x]))

    assert_almost_equal(b.mean(), d.mean()[0])
    assert_almost_equal(b.var(), d.var()[0])


def test_dimensions_mismatch():
    # Regression test for GH #3493. Check that setting up a PDF with a mean of
    # length M and a covariance matrix of size (N, N), where M != N, raises a
    # ValueError with an informative error message.

    mu = np.array([0.0, 0.0])
    sigma = np.array([[1.0]])

    assert_raises(ValueError, multivariate_normal, mu, sigma)

    # A simple check that the right error message was passed along. Checking
    # that the entire message is there, word for word, would be somewhat
    # fragile, so we just check for the leading part.
    try:
        multivariate_normal(mu, sigma)
    except ValueError as e:
        msg = "Dimension mismatch"
        assert_equal(str(e)[:len(msg)], msg)


if __name__ == "__main__":
    run_module_suite()
@ -1,193 +0,0 @@
from __future__ import division, print_function, absolute_import

import numpy as np
from numpy.testing import TestCase, run_module_suite, assert_equal, \
    assert_array_equal

from wafo.stats import rankdata, tiecorrect


class TestTieCorrect(TestCase):

    def test_empty(self):
        """An empty array requires no correction, should return 1.0."""
        ranks = np.array([], dtype=np.float64)
        c = tiecorrect(ranks)
        assert_equal(c, 1.0)

    def test_one(self):
        """A single element requires no correction, should return 1.0."""
        ranks = np.array([1.0], dtype=np.float64)
        c = tiecorrect(ranks)
        assert_equal(c, 1.0)

    def test_no_correction(self):
        """Arrays with no ties require no correction."""
        ranks = np.arange(2.0)
        c = tiecorrect(ranks)
        assert_equal(c, 1.0)
        ranks = np.arange(3.0)
        c = tiecorrect(ranks)
        assert_equal(c, 1.0)

    def test_basic(self):
        """Check a few basic examples of the tie correction factor."""
        # One tie of two elements
        ranks = np.array([1.0, 2.5, 2.5])
        c = tiecorrect(ranks)
        T = 2.0
        N = ranks.size
        expected = 1.0 - (T**3 - T) / (N**3 - N)
        assert_equal(c, expected)

        # One tie of two elements (same as above, but tie is not at the end)
        ranks = np.array([1.5, 1.5, 3.0])
        c = tiecorrect(ranks)
        T = 2.0
        N = ranks.size
        expected = 1.0 - (T**3 - T) / (N**3 - N)
        assert_equal(c, expected)

        # One tie of three elements
        ranks = np.array([1.0, 3.0, 3.0, 3.0])
        c = tiecorrect(ranks)
        T = 3.0
        N = ranks.size
        expected = 1.0 - (T**3 - T) / (N**3 - N)
        assert_equal(c, expected)

        # Two ties, lengths 2 and 3.
        ranks = np.array([1.5, 1.5, 4.0, 4.0, 4.0])
        c = tiecorrect(ranks)
        T1 = 2.0
        T2 = 3.0
        N = ranks.size
        expected = 1.0 - ((T1**3 - T1) + (T2**3 - T2)) / (N**3 - N)
        assert_equal(c, expected)
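

# Reference implementation of the factor the asserts above expect (a sketch,
# not part of the original file): for tie groups of sizes t_1..t_g among N
# ranks, tiecorrect = 1 - sum(t**3 - t) / (N**3 - N); singleton groups
# contribute nothing since t**3 - t = 0 for t = 1.
def _tiecorrect_reference(ranks):
    _, counts = np.unique(ranks, return_counts=True)
    n = len(ranks)
    if n < 2:
        return 1.0
    return 1.0 - np.sum(counts**3 - counts) / float(n**3 - n)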


class TestRankData(TestCase):

    def test_empty(self):
        """stats.rankdata([]) should return an empty array."""
        a = np.array([], dtype=int)
        r = rankdata(a)
        assert_array_equal(r, np.array([], dtype=np.float64))
        r = rankdata([])
        assert_array_equal(r, np.array([], dtype=np.float64))

    def test_one(self):
        """Check stats.rankdata with an array of length 1."""
        data = [100]
        a = np.array(data, dtype=int)
        r = rankdata(a)
        assert_array_equal(r, np.array([1.0], dtype=np.float64))
        r = rankdata(data)
        assert_array_equal(r, np.array([1.0], dtype=np.float64))

    def test_basic(self):
        """Basic tests of stats.rankdata."""
        data = [100, 10, 50]
        expected = np.array([3.0, 1.0, 2.0], dtype=np.float64)
        a = np.array(data, dtype=int)
        r = rankdata(a)
        assert_array_equal(r, expected)
        r = rankdata(data)
        assert_array_equal(r, expected)

        data = [40, 10, 30, 10, 50]
        expected = np.array([4.0, 1.5, 3.0, 1.5, 5.0], dtype=np.float64)
        a = np.array(data, dtype=int)
        r = rankdata(a)
        assert_array_equal(r, expected)
        r = rankdata(data)
        assert_array_equal(r, expected)

        data = [20, 20, 20, 10, 10, 10]
        expected = np.array([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=np.float64)
        a = np.array(data, dtype=int)
        r = rankdata(a)
        assert_array_equal(r, expected)
        r = rankdata(data)
        assert_array_equal(r, expected)
        # The docstring states explicitly that the argument is flattened.
        a2d = a.reshape(2, 3)
        r = rankdata(a2d)
        assert_array_equal(r, expected)

    def test_large_int(self):
        data = np.array([2**60, 2**60+1], dtype=np.uint64)
        r = rankdata(data)
        assert_array_equal(r, [1.0, 2.0])

        data = np.array([2**60, 2**60+1], dtype=np.int64)
        r = rankdata(data)
        assert_array_equal(r, [1.0, 2.0])

        data = np.array([2**60, -2**60+1], dtype=np.int64)
        r = rankdata(data)
        assert_array_equal(r, [2.0, 1.0])

    def test_big_tie(self):
        for n in [10000, 100000, 1000000]:
            data = np.ones(n, dtype=int)
            r = rankdata(data)
            expected_rank = 0.5 * (n + 1)
            assert_array_equal(r, expected_rank * data,
                               "test failed with n=%d" % n)


_cases = (
    # values, method, expected
    ([], 'average', []),
    ([], 'min', []),
    ([], 'max', []),
    ([], 'dense', []),
    ([], 'ordinal', []),
    #
    ([100], 'average', [1.0]),
    ([100], 'min', [1.0]),
    ([100], 'max', [1.0]),
    ([100], 'dense', [1.0]),
    ([100], 'ordinal', [1.0]),
    #
    ([100, 100, 100], 'average', [2.0, 2.0, 2.0]),
    ([100, 100, 100], 'min', [1.0, 1.0, 1.0]),
    ([100, 100, 100], 'max', [3.0, 3.0, 3.0]),
    ([100, 100, 100], 'dense', [1.0, 1.0, 1.0]),
    ([100, 100, 100], 'ordinal', [1.0, 2.0, 3.0]),
    #
    ([100, 300, 200], 'average', [1.0, 3.0, 2.0]),
    ([100, 300, 200], 'min', [1.0, 3.0, 2.0]),
    ([100, 300, 200], 'max', [1.0, 3.0, 2.0]),
    ([100, 300, 200], 'dense', [1.0, 3.0, 2.0]),
    ([100, 300, 200], 'ordinal', [1.0, 3.0, 2.0]),
    #
    ([100, 200, 300, 200], 'average', [1.0, 2.5, 4.0, 2.5]),
    ([100, 200, 300, 200], 'min', [1.0, 2.0, 4.0, 2.0]),
    ([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]),
    ([100, 200, 300, 200], 'dense', [1.0, 2.0, 3.0, 2.0]),
    ([100, 200, 300, 200], 'ordinal', [1.0, 2.0, 4.0, 3.0]),
    #
    ([100, 200, 300, 200, 100], 'average', [1.5, 3.5, 5.0, 3.5, 1.5]),
    ([100, 200, 300, 200, 100], 'min', [1.0, 3.0, 5.0, 3.0, 1.0]),
    ([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 2.0]),
    ([100, 200, 300, 200, 100], 'dense', [1.0, 2.0, 3.0, 2.0, 1.0]),
    ([100, 200, 300, 200, 100], 'ordinal', [1.0, 3.0, 5.0, 4.0, 2.0]),
    #
    ([10] * 30, 'ordinal', np.arange(1.0, 31.0)),
)


def test_cases():

    def check_case(values, method, expected):
        r = rankdata(values, method=method)
        assert_array_equal(r, expected)

    for values, method, expected in _cases:
        yield check_case, values, method, expected


if __name__ == "__main__":
    run_module_suite()
File diff suppressed because it is too large
@ -1,91 +0,0 @@
from __future__ import division, print_function, absolute_import

import numpy as np
from numpy.testing import assert_allclose, assert_equal, run_module_suite

from scipy.stats._tukeylambda_stats import tukeylambda_variance, \
    tukeylambda_kurtosis


def test_tukeylambda_stats_known_exact():
    """Compare results with some known exact formulas."""
    # Some exact values of the Tukey Lambda variance and kurtosis:
    # lambda   var       kurtosis
    #   0     pi**2/3     6/5                  (logistic distribution)
    #  0.5    4 - pi     (5/3 - pi/2)/(pi/4 - 1)**2 - 3
    #   1      1/3       -6/5                  (uniform distribution on (-1, 1))
    #   2      1/12      -6/5                  (uniform distribution on (-1/2, 1/2))

    # lambda = 0
    var = tukeylambda_variance(0)
    assert_allclose(var, np.pi**2 / 3, atol=1e-12)
    kurt = tukeylambda_kurtosis(0)
    assert_allclose(kurt, 1.2, atol=1e-10)

    # lambda = 0.5
    var = tukeylambda_variance(0.5)
    assert_allclose(var, 4 - np.pi, atol=1e-12)
    kurt = tukeylambda_kurtosis(0.5)
    desired = (5. / 3 - np.pi / 2) / (np.pi / 4 - 1)**2 - 3
    assert_allclose(kurt, desired, atol=1e-10)

    # lambda = 1
    var = tukeylambda_variance(1)
    assert_allclose(var, 1.0 / 3, atol=1e-12)
    kurt = tukeylambda_kurtosis(1)
    assert_allclose(kurt, -1.2, atol=1e-10)

    # lambda = 2
    var = tukeylambda_variance(2)
    assert_allclose(var, 1.0 / 12, atol=1e-12)
    kurt = tukeylambda_kurtosis(2)
    assert_allclose(kurt, -1.2, atol=1e-10)
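

# Closed form behind the spot checks above (a sketch; this is the standard
# Tukey lambda variance, assumed to match the implementation under test):
# for lam > -0.5 and lam != 0,
#   var(lam) = (2/lam**2) * (1/(1 + 2*lam) - Gamma(lam+1)**2 / Gamma(2*lam+2)).
# E.g. lam = 1 gives 2 * (1/3 - 1/6) = 1/3, matching the table above.
def _tukeylambda_var_reference(lam):
    from scipy.special import gammaln
    lam = float(lam)
    return (2.0 / lam**2) * (1.0 / (1 + 2 * lam) -
                             np.exp(2 * gammaln(lam + 1) - gammaln(2 * lam + 2)))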


def test_tukeylambda_stats_mpmath():
    """Compare results with some values that were computed using mpmath."""
    a10 = dict(atol=1e-10, rtol=0)
    a12 = dict(atol=1e-12, rtol=0)
    data = [
        # lambda      variance                kurtosis
        [-0.1,        4.78050217874253547,    3.78559520346454510],
        [-0.0649,     4.16428023599895777,    2.52019675947435718],
        [-0.05,       3.93672267890775277,    2.13129793057777277],
        [-0.001,      3.30128380390964882,    1.21452460083542988],
        [0.001,       3.27850775649572176,    1.18560634779287585],
        [0.03125,     2.95927803254615800,    0.804487555161819980],
        [0.05,        2.78281053405464501,    0.611604043886644327],
        [0.0649,      2.65282386754100551,    0.476834119532774540],
        [1.2,         0.242153920578588346,  -1.23428047169049726],
        [10.0,        0.00095237579757703597, 2.37810697355144933],
        [20.0,        0.00012195121951131043, 7.37654321002709531],
    ]

    for lam, var_expected, kurt_expected in data:
        var = tukeylambda_variance(lam)
        assert_allclose(var, var_expected, **a12)
        kurt = tukeylambda_kurtosis(lam)
        assert_allclose(kurt, kurt_expected, **a10)

    # Test with vector arguments (most of the other tests are for single
    # values).
    lam, var_expected, kurt_expected = zip(*data)
    var = tukeylambda_variance(lam)
    assert_allclose(var, var_expected, **a12)
    kurt = tukeylambda_kurtosis(lam)
    assert_allclose(kurt, kurt_expected, **a10)


def test_tukeylambda_stats_invalid():
    """Test values of lambda outside the domains of the functions."""
    lam = [-1.0, -0.5]
    var = tukeylambda_variance(lam)
    assert_equal(var, np.array([np.nan, np.inf]))

    lam = [-1.0, -0.25]
    kurt = tukeylambda_kurtosis(lam)
    assert_equal(kurt, np.array([np.nan, np.inf]))


if __name__ == "__main__":
    run_module_suite()
@ -1,412 +0,0 @@
"""
Commentary
----------

Most of the work is done by the scipy.stats.distributions module.

This provides a plethora of continuous distributions to play with.

Each distribution has functions to generate random deviates, pdf's,
cdf's etc. as well as a function to fit the distribution to some given
data.

The fitting uses scipy.optimize.fmin to minimise the negative
log-likelihood of the data given the distribution.

There are a couple of problems with this approach. First, it is
sensitive to the initial guess at the parameters. Second, it can be a
little slow.

Two key parameters are the 'loc' and 'scale' parameters. Data is
shifted by 'loc' and scaled by 'scale' prior to fitting. Supplying
appropriate values for these parameters is important to getting a good
fit.

See the factory() function, which picks from a handful of common
approaches for each distribution.

For some distributions (e.g. normal) it really makes sense just to
calculate the parameters directly from the data.

The code in the __main__ block should be a good guide to how to use this.

Simply:
    get a QuickFit object
    add the distributions you want to try to fit
    call fit() with your data
    call stats() to generate some stats on the fit
    call plot() if you want to see a plot

Named after Mrs Twolumps, minister's secretary in the silly walks
sketch, who brings in coffee with a full silly walk.

Tenuous link with curve fitting is that you generally see "two lumps":
one in your data and the other in the curve that is being fitted.

Or alternatively, if your data is not too silly then you can fit a
curve to it.

License is GNU LGPL v3, see https://launchpad.net/twolumps
"""
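
# A minimal usage sketch (not from the original module; assumes `data` is a
# sorted 1-D numpy array):
#
#     qf = QuickFit()
#     qf.add('norm')
#     qf.add('weibull_min')
#     qf.fit(data)
#     qf.stats(data)
#     qf.plot(data, topn=2)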
import inspect

import numpy
from numpy import mean, std  # scipy.mean/scipy.std were deprecated aliases
from wafo import stats


def factory(name):
    """ Factory to return appropriate objects for each distro. """
    fitters = dict(
        beta=ZeroOneScipyDistribution,
        alpha=ZeroOneScipyDistribution,
        ncf=ZeroOneScipyDistribution,
        triang=ZeroOneScipyDistribution,
        uniform=ZeroOneScipyDistribution,
        powerlaw=ZeroOneScipyDistribution,

        pareto=MinLocScipyDistribution,
        expon=MinLocScipyDistribution,
        gamma=MinLocScipyDistribution,
        lognorm=MinLocScipyDistribution,
        maxwell=MinLocScipyDistribution,
        weibull_min=MinLocScipyDistribution,

        weibull_max=MaxLocScipyDistribution)

    return fitters.get(name, ScipyDistribution)(name)


def get_continuous_distros():
    """ Find all attributes of stats that are continuous distributions. """

    fitters = []
    skip = set()
    for name, item in inspect.getmembers(stats):
        if name in skip:
            continue
        if item is stats.rv_continuous:
            continue
        if isinstance(item, stats.rv_continuous):
            fitters.append([name, factory(name)])

    return fitters
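

# Example of the dispatch above (a brief note, not in the original file):
# factory('beta') returns a ZeroOneScipyDistribution, while an unlisted name
# such as 'norm' falls through to the plain ScipyDistribution.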


class ScipyDistribution(object):

    def __init__(self, name):
        self.name = name
        self.distro = self.get_distro()
        self.fitted = None

    def __getattr__(self, attr):
        """ Try delegating to the distro object """
        return getattr(self.distro, attr)

    def get_distro(self):
        return getattr(stats, self.name)

    def set_distro(self, parms):
        self.distro = getattr(stats, self.name)(*parms)

        return self.distro

    def calculate_loc_and_scale(self, data):
        """ Calculate loc and scale parameters for fit.

        Depending on the distribution, these need to be approximately
        right to get a good fit.
        """
        return mean(data), std(data)

    def fit(self, data, *args, **kwargs):
        """ This needs some work.

        Seems the various scipy distributions do a reasonable job if
        given a good hint.

        Need to get distro specific hints.
        """
        fits = []

        # Try with and without providing loc and scale hints; this
        # increases the chance of a fit without an exception being
        # generated.
        for (loc, scale) in ((0.0, 1.0),
                             self.calculate_loc_and_scale(data)):
            try:
                parms = self.get_distro().fit(data, loc=loc, scale=scale)

                self.set_distro(list(parms))
                expected = self.expected(data)
                rss = ((expected - data)**2).sum()
                fits.append([rss, list(parms)])

                parms = self.get_distro().fit(data, floc=loc, scale=scale)

                self.set_distro(list(parms))
                expected = self.expected(data)
                rss = ((expected - data)**2).sum()
                fits.append([rss, list(parms)])
            except Exception:
                pass

        # No fits means all tries raised exceptions.
        if not fits:
            raise Exception("all fit attempts failed in fit()")

        # Pick the one with the smallest rss.
        fits.sort(key=lambda f: f[0])
        self.parms = fits[0][1]
        print(self.parms)

        return self.set_distro(list(self.parms))

    def expected(self, data):
        """ Calculate expected values at each data point """
        if self.fitted is not None:
            return self.fitted

        n = len(data)
        xx = numpy.linspace(0, 1, n + 2)[1:-1]
        self.fitted = self.ppf(xx)

        return self.fitted

    def fit_stats(self, data):
        """ Return stats on the fits

        data assumed to be sorted.
        """
        n = len(data)

        dvar = numpy.var(data)
        expected = self.expected(data)
        evar = numpy.var(expected)

        rss = 0.0
        for expect, obs in zip(expected, data):
            rss += (obs - expect) ** 2.0

        self.rss = rss
        self.dss = dvar * n
        self.fss = evar * n

    def residuals(self, data):
        """ Return residuals """
        expected = self.expected(data)

        return numpy.array(data) - numpy.array(expected)


class MinLocScipyDistribution(ScipyDistribution):

    def calculate_loc_and_scale(self, data):
        """ Set loc to min value in the data.

        Useful for weibull_min
        """
        return min(data), std(data)


class MaxLocScipyDistribution(ScipyDistribution):

    def calculate_loc_and_scale(self, data):
        """ Set loc to max value in the data.

        Useful for weibull_max
        """
        return max(data), std(data)


class ZeroOneScipyDistribution(ScipyDistribution):

    def calculate_loc_and_scale(self, data):
        """ Set loc and scale to move to [0, 1] interval.

        Useful for beta distribution
        """
        return min(data), max(data) - min(data)


class QuickFit(object):
    """ Fit a family of distributions.

    Calculates stats on each fit.

    Option to create plots.
    """

    def __init__(self):
        self.distributions = []

    def add_distribution(self, distribution):
        """ Add a ready-prepared ScipyDistribution """
        self.distributions.append(distribution)

    def add(self, name):
        """ Add a distribution by name. """
        self.distributions.append(factory(name))

    def fit(self, data):
        """ Fit all of the distros we have """
        fitted = []
        for distro in self.distributions:
            print('fitting distro', distro.name)
            try:
                distro.fit(data)
            except Exception:
                continue
            fitted.append(distro)
        self.distributions = fitted

        print('finished fitting')

    def stats(self, data):
        """ Return stats on the fits """
        for dd in self.distributions:
            dd.fit_stats(data)

    def get_topn(self, n):
        """ Return top-n best fits. """
        data = [[x.rss, x] for x in self.distributions
                if numpy.isfinite(x.rss)]
        data.sort(key=lambda t: t[0])

        if not n:
            n = len(data)

        return [x[1] for x in data[:n]]

    def fit_plot(self, data, topn=0, bins=20):
        """ Create a plot. """
        from matplotlib import pylab as pl

        distros = self.get_topn(topn)

        xx = numpy.linspace(data.min(), data.max(), 300)

        table = []
        nparms = max(len(x.parms) for x in distros)
        tcolours = []
        for dd in distros:
            patch = pl.plot(xx, [dd.pdf(p) for p in xx],
                            label='%10.2f%% %s' % (100.0 * dd.rss / dd.dss,
                                                   dd.name))
            row = (['', dd.name, '%10.2f%%' % (100.0 * dd.rss / dd.dss,)] +
                   ['%0.2f' % x for x in dd.parms])
            while len(row) < 3 + nparms:
                row.append('')
            table.append(row)
            tcolours.append([patch[0].get_markerfacecolor()] +
                            ['w'] * (2 + nparms))

        # Add a histogram with the data.
        pl.hist(data, bins=bins, normed=True)
        tab = pl.table(cellText=table, cellColours=tcolours,
                       colLabels=['', 'Distribution', 'Res. SS/Data SS'] +
                       ['P%d' % (x + 1,) for x in range(nparms)],
                       bbox=(0.0, 1.0, 1.0, 0.3))
        tab.auto_set_font_size(False)
        tab.set_fontsize(10.)

    def residual_plot(self, data, topn=0):
        """ Create a residual plot. """
        from matplotlib import pylab as pl

        distros = self.get_topn(topn)

        n = len(data)
        xx = numpy.linspace(0, 1, n + 2)[1:-1]
        for dd in distros:
            pl.plot(xx, dd.residuals(data),
                    label='%10.2f%% %s' % (100.0 * dd.rss / dd.dss, dd.name))
        pl.grid(True)

    def plot(self, data, topn):
        """ Plot data fit and residuals """
        from matplotlib import pylab as pl
        pl.axes([0.1, 0.4, 0.8, 0.4])  # leave room above the axes for the table
        self.fit_plot(data, topn=topn)

        pl.axes([0.1, 0.05, 0.8, 0.3])
        self.residual_plot(data, topn=topn)


def read_data(infile, field):
    """ Simple utility to extract a field out of a csv file. """
    import csv

    reader = csv.reader(infile)
    header = next(reader)
    field = header.index(field)
    data = []
    for row in reader:
        data.append(float(row[field]))

    return data


if __name__ == '__main__':

    import sys
    import optparse

    from matplotlib import pylab as pl

    parser = optparse.OptionParser()
    parser.add_option('-d', '--distro', action='append', default=[])
    parser.add_option('-l', '--list', action='store_true',
                      help='List available distros')

    parser.add_option('-i', '--infile')
    parser.add_option('-f', '--field', default='P/L')

    parser.add_option('-n', '--topn', type='int', default=0)

    parser.add_option('-s', '--sample', default='normal',
                      help='generate a sample from this distro as a test')
    parser.add_option('--size', type='int', default=1000,
                      help='Size of sample to generate')

    opts, args = parser.parse_args()

    if opts.list:
        for name, distro in get_continuous_distros():
            print(name)
        sys.exit()
    if not opts.distro:
        opts.distro = [x[0] for x in get_continuous_distros()]

    quickfit = QuickFit()
    for distro in opts.distro:
        quickfit.add(distro)

    if opts.sample:
        data = getattr(numpy.random, opts.sample)(size=opts.size)
    else:
        data = numpy.array(read_data(open(opts.infile), opts.field))

    data.sort()

    quickfit.fit(data)
    print('doing stats')
    quickfit.stats(data)

    print('doing plot')
    quickfit.plot(data, topn=opts.topn)
    pl.show()
@ -1,47 +0,0 @@
from __future__ import division, print_function, absolute_import

import numpy as np
import scipy.stats
from scipy.special import i0


def von_mises_cdf_series(k, x, p):
    x = float(x)
    s = np.sin(x)
    c = np.cos(x)
    sn = np.sin(p * x)
    cn = np.cos(p * x)
    R = 0
    V = 0
    for n in range(p - 1, 0, -1):
        sn, cn = sn * c - cn * s, cn * c + sn * s
        R = 1. / (2 * n / k + R)
        V = R * (sn / n + V)

    return 0.5 + x / (2 * np.pi) + V / np.pi


def von_mises_cdf_normalapprox(k, x, C1):
    b = np.sqrt(2 / np.pi) * np.exp(k) / i0(k)
    z = b * np.sin(x / 2.)
    return scipy.stats.norm.cdf(z)


def von_mises_cdf(k, x):
    ix = 2 * np.pi * np.round(x / (2 * np.pi))
    x = x - ix
    k = float(k)

    # These values should give 12 decimal digits
    CK = 50
    a = [28., 0.5, 100., 5.0]
    C1 = 50.1

    if k < CK:
        p = int(np.ceil(a[0] + a[1] * k - a[2] / (k + a[3])))

        F = np.clip(von_mises_cdf_series(k, x, p), 0, 1)
    else:
        F = von_mises_cdf_normalapprox(k, x, C1)

    return F + ix
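
# What the series evaluates (a sketch of the underlying identity, not stated
# in the original file; the structure resembles Hill's classic algorithm for
# the von Mises CDF, though that attribution is an assumption): on (-pi, pi),
#   F(x) = 1/2 + x/(2*pi) + (1/pi) * sum_{n>=1} sin(n*x) * I_n(k) / (n * I_0(k)),
# where the Bessel ratios I_n/I_{n-1} are generated by the backward recurrence
# R_n = 1 / (2*n/k + R_{n+1}) used in the loop above. A quick numerical
# cross-check against direct integration of the density:


def _von_mises_cdf_check(k=2.0, x=0.7):
    from scipy.integrate import quad
    density = lambda t: np.exp(k * np.cos(t)) / (2 * np.pi * i0(k))
    numeric, _ = quad(density, -np.pi, x)
    return numeric, von_mises_cdf(k, x)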
@ -1,76 +0,0 @@
import numpy as np
import scipy.stats
from scipy.special import i0
import numpy.testing
cimport numpy as np

cdef extern from "math.h":
    double cos(double theta)
    double sin(double theta)


cdef double von_mises_cdf_series(double k, double x, unsigned int p):
    cdef double s, c, sn, cn, R, V
    cdef unsigned int n
    s = sin(x)
    c = cos(x)
    sn = sin(p * x)
    cn = cos(p * x)
    R = 0
    V = 0
    for n in range(p - 1, 0, -1):
        sn, cn = sn * c - cn * s, cn * c + sn * s
        R = 1. / (2 * n / k + R)
        V = R * (sn / n + V)

    return 0.5 + x / (2 * np.pi) + V / np.pi


def von_mises_cdf_normalapprox(k, x, C1):
    b = np.sqrt(2 / np.pi) * np.exp(k) / i0(k)
    z = b * np.sin(x / 2.)
    # Note: chi is computed but never used; the value returned is the plain
    # normal cdf of z.
    C = 24 * k
    chi = z - z**3 / ((C - 2 * z**2 - 16) / 3. -
                      (z**4 + 7 / 4. * z**2 + 167. / 2) / (C + C1 - z**2 + 3))**2
    return scipy.stats.norm.cdf(z)


cimport cython
@cython.boundscheck(False)
def von_mises_cdf(k, x):
    cdef np.ndarray[double, ndim=1] temp, temp_xs, temp_ks
    cdef unsigned int i, p
    cdef double a1, a2, a3, a4, C1, CK
    k = np.asarray(k)
    x = np.asarray(x)
    zerodim = k.ndim == 0 and x.ndim == 0

    k = np.atleast_1d(k)
    x = np.atleast_1d(x)
    ix = np.round(x / (2 * np.pi))
    x = x - ix * 2 * np.pi

    # These values should give 12 decimal digits
    CK = 50
    a1, a2, a3, a4 = [28., 0.5, 100., 5.0]
    C1 = 50.1

    bx, bk = np.broadcast_arrays(x, k)
    result = np.empty(bx.shape, dtype=np.float64)

    c_small_k = bk < CK
    temp = result[c_small_k]
    temp_xs = bx[c_small_k].astype(np.float64)
    temp_ks = bk[c_small_k].astype(np.float64)
    for i in range(len(temp)):
        p = <int>(1 + a1 + a2 * temp_ks[i] - a3 / (temp_ks[i] + a4))
        temp[i] = von_mises_cdf_series(temp_ks[i], temp_xs[i], p)
        if temp[i] < 0:
            temp[i] = 0
        elif temp[i] > 1:
            temp[i] = 1
    result[c_small_k] = temp
    result[~c_small_k] = von_mises_cdf_normalapprox(bk[~c_small_k],
                                                    bx[~c_small_k], C1)

    if not zerodim:
        return result + ix
    else:
        return (result + ix)[0]
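
# Implementation note (not in the original file): this Cython version mirrors
# the pure-Python module above -- the same backward series for k < 50 and the
# same simple normal approximation otherwise -- but it broadcasts k and x and
# runs the series in a typed C loop for speed. The `chi` correction term it
# computes is dead code: only norm.cdf(z) is returned.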