Fixed more bugs in distributions.py
parent
6b88f2d4cc
commit
0bfe623f5c
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,402 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
from scipy.lib.six import callable
|
||||
|
||||
|
||||
def binned_statistic(x, values, statistic='mean',
|
||||
bins=10, range=None):
|
||||
"""
|
||||
Compute a binned statistic for a set of data.
|
||||
|
||||
This is a generalization of a histogram function. A histogram divides
|
||||
the space into bins, and returns the count of the number of points in
|
||||
each bin. This function allows the computation of the sum, mean, median,
|
||||
or other statistic of the values within each bin.
|
||||
|
||||
.. versionadded:: 0.11.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array_like
|
||||
A sequence of values to be binned.
|
||||
values : array_like
|
||||
The values on which the statistic will be computed. This must be
|
||||
the same shape as `x`.
|
||||
statistic : string or callable, optional
|
||||
The statistic to compute (default is 'mean').
|
||||
The following statistics are available:
|
||||
|
||||
* 'mean' : compute the mean of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'median' : compute the median of values for points within each
|
||||
bin. Empty bins will be represented by NaN.
|
||||
* 'count' : compute the count of points within each bin. This is
|
||||
identical to an unweighted histogram. `values` array is not
|
||||
referenced.
|
||||
* 'sum' : compute the sum of values for points within each bin.
|
||||
This is identical to a weighted histogram.
|
||||
* function : a user-defined function which takes a 1D array of
|
||||
values, and outputs a single numerical statistic. This function
|
||||
will be called on the values in each bin. Empty bins will be
|
||||
represented by function([]), or NaN if this returns an error.
|
||||
|
||||
bins : int or sequence of scalars, optional
|
||||
If `bins` is an int, it defines the number of equal-width
|
||||
bins in the given range (10, by default). If `bins` is a sequence,
|
||||
it defines the bin edges, including the rightmost edge, allowing
|
||||
for non-uniform bin widths.
|
||||
range : (float, float) or [(float, float)], optional
|
||||
The lower and upper range of the bins. If not provided, range
|
||||
is simply ``(x.min(), x.max())``. Values outside the range are
|
||||
ignored.
|
||||
|
||||
Returns
|
||||
-------
|
||||
statistic : array
|
||||
The values of the selected statistic in each bin.
|
||||
bin_edges : array of dtype float
|
||||
Return the bin edges ``(length(statistic)+1)``.
|
||||
binnumber : 1-D ndarray of ints
|
||||
This assigns to each observation an integer that represents the bin
|
||||
in which this observation falls. Array has the same length as values.
|
||||
|
||||
See Also
|
||||
--------
|
||||
numpy.histogram, binned_statistic_2d, binned_statistic_dd
|
||||
|
||||
Notes
|
||||
-----
|
||||
All but the last (righthand-most) bin is half-open. In other words, if
|
||||
`bins` is::
|
||||
|
||||
[1, 2, 3, 4]
|
||||
|
||||
then the first bin is ``[1, 2)`` (including 1, but excluding 2) and the
|
||||
second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which *includes*
|
||||
4.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
|
||||
... bins=3)
|
||||
(array([ 1., 2., 4.]), array([ 1., 2., 3., 4.]), array([1, 2, 1, 2, 3]))
|
||||
|
||||
>>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean', bins=3)
|
||||
(array([ 1., 2., 4.]), array([ 1., 2., 3., 4.]), array([1, 2, 1, 2, 3]))
|
||||
|
||||
"""
|
||||
try:
|
||||
N = len(bins)
|
||||
except TypeError:
|
||||
N = 1
|
||||
|
||||
if N != 1:
|
||||
bins = [np.asarray(bins, float)]
|
||||
|
||||
if range is not None:
|
||||
if len(range) == 2:
|
||||
range = [range]
|
||||
|
||||
medians, edges, xy = binned_statistic_dd([x], values, statistic,
|
||||
bins, range)
|
||||
|
||||
return medians, edges[0], xy
|
||||
|
||||
|
||||
def binned_statistic_2d(x, y, values, statistic='mean',
|
||||
bins=10, range=None):
|
||||
"""
|
||||
Compute a bidimensional binned statistic for a set of data.
|
||||
|
||||
This is a generalization of a histogram2d function. A histogram divides
|
||||
the space into bins, and returns the count of the number of points in
|
||||
each bin. This function allows the computation of the sum, mean, median,
|
||||
or other statistic of the values within each bin.
|
||||
|
||||
.. versionadded:: 0.11.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : (N,) array_like
|
||||
A sequence of values to be binned along the first dimension.
|
||||
y : (M,) array_like
|
||||
A sequence of values to be binned along the second dimension.
|
||||
values : (N,) array_like
|
||||
The values on which the statistic will be computed. This must be
|
||||
the same shape as `x`.
|
||||
statistic : string or callable, optional
|
||||
The statistic to compute (default is 'mean').
|
||||
The following statistics are available:
|
||||
|
||||
* 'mean' : compute the mean of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'median' : compute the median of values for points within each
|
||||
bin. Empty bins will be represented by NaN.
|
||||
* 'count' : compute the count of points within each bin. This is
|
||||
identical to an unweighted histogram. `values` array is not
|
||||
referenced.
|
||||
* 'sum' : compute the sum of values for points within each bin.
|
||||
This is identical to a weighted histogram.
|
||||
* function : a user-defined function which takes a 1D array of
|
||||
values, and outputs a single numerical statistic. This function
|
||||
will be called on the values in each bin. Empty bins will be
|
||||
represented by function([]), or NaN if this returns an error.
|
||||
|
||||
bins : int or [int, int] or array-like or [array, array], optional
|
||||
The bin specification:
|
||||
|
||||
* the number of bins for the two dimensions (nx=ny=bins),
|
||||
* the number of bins in each dimension (nx, ny = bins),
|
||||
* the bin edges for the two dimensions (x_edges = y_edges = bins),
|
||||
* the bin edges in each dimension (x_edges, y_edges = bins).
|
||||
|
||||
range : (2,2) array_like, optional
|
||||
The leftmost and rightmost edges of the bins along each dimension
|
||||
(if not specified explicitly in the `bins` parameters):
|
||||
[[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
|
||||
considered outliers and not tallied in the histogram.
|
||||
|
||||
Returns
|
||||
-------
|
||||
statistic : (nx, ny) ndarray
|
||||
The values of the selected statistic in each two-dimensional bin
|
||||
xedges : (nx + 1) ndarray
|
||||
The bin edges along the first dimension.
|
||||
yedges : (ny + 1) ndarray
|
||||
The bin edges along the second dimension.
|
||||
binnumber : 1-D ndarray of ints
|
||||
This assigns to each observation an integer that represents the bin
|
||||
in which this observation falls. Array has the same length as `values`.
|
||||
|
||||
See Also
|
||||
--------
|
||||
numpy.histogram2d, binned_statistic, binned_statistic_dd
|
||||
|
||||
"""
|
||||
|
||||
# This code is based on np.histogram2d
|
||||
try:
|
||||
N = len(bins)
|
||||
except TypeError:
|
||||
N = 1
|
||||
|
||||
if N != 1 and N != 2:
|
||||
xedges = yedges = np.asarray(bins, float)
|
||||
bins = [xedges, yedges]
|
||||
|
||||
medians, edges, xy = binned_statistic_dd([x, y], values, statistic,
|
||||
bins, range)
|
||||
|
||||
return medians, edges[0], edges[1], xy
|
||||
|
||||
|
||||
def binned_statistic_dd(sample, values, statistic='mean',
|
||||
bins=10, range=None):
|
||||
"""
|
||||
Compute a multidimensional binned statistic for a set of data.
|
||||
|
||||
This is a generalization of a histogramdd function. A histogram divides
|
||||
the space into bins, and returns the count of the number of points in
|
||||
each bin. This function allows the computation of the sum, mean, median,
|
||||
or other statistic of the values within each bin.
|
||||
|
||||
.. versionadded:: 0.11.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sample : array_like
|
||||
Data to histogram passed as a sequence of D arrays of length N, or
|
||||
as an (N,D) array.
|
||||
values : array_like
|
||||
The values on which the statistic will be computed. This must be
|
||||
the same shape as x.
|
||||
statistic : string or callable, optional
|
||||
The statistic to compute (default is 'mean').
|
||||
The following statistics are available:
|
||||
|
||||
* 'mean' : compute the mean of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'median' : compute the median of values for points within each
|
||||
bin. Empty bins will be represented by NaN.
|
||||
* 'count' : compute the count of points within each bin. This is
|
||||
identical to an unweighted histogram. `values` array is not
|
||||
referenced.
|
||||
* 'sum' : compute the sum of values for points within each bin.
|
||||
This is identical to a weighted histogram.
|
||||
* function : a user-defined function which takes a 1D array of
|
||||
values, and outputs a single numerical statistic. This function
|
||||
will be called on the values in each bin. Empty bins will be
|
||||
represented by function([]), or NaN if this returns an error.
|
||||
|
||||
bins : sequence or int, optional
|
||||
The bin specification:
|
||||
|
||||
* A sequence of arrays describing the bin edges along each dimension.
|
||||
* The number of bins for each dimension (nx, ny, ... =bins)
|
||||
* The number of bins for all dimensions (nx=ny=...=bins).
|
||||
|
||||
range : sequence, optional
|
||||
A sequence of lower and upper bin edges to be used if the edges are
|
||||
not given explicitely in `bins`. Defaults to the minimum and maximum
|
||||
values along each dimension.
|
||||
|
||||
Returns
|
||||
-------
|
||||
statistic : ndarray, shape(nx1, nx2, nx3,...)
|
||||
The values of the selected statistic in each two-dimensional bin
|
||||
edges : list of ndarrays
|
||||
A list of D arrays describing the (nxi + 1) bin edges for each
|
||||
dimension
|
||||
binnumber : 1-D ndarray of ints
|
||||
This assigns to each observation an integer that represents the bin
|
||||
in which this observation falls. Array has the same length as values.
|
||||
|
||||
See Also
|
||||
--------
|
||||
np.histogramdd, binned_statistic, binned_statistic_2d
|
||||
|
||||
"""
|
||||
if type(statistic) == str:
|
||||
if statistic not in ['mean', 'median', 'count', 'sum', 'std']:
|
||||
raise ValueError('unrecognized statistic "%s"' % statistic)
|
||||
elif callable(statistic):
|
||||
pass
|
||||
else:
|
||||
raise ValueError("statistic not understood")
|
||||
|
||||
# This code is based on np.histogramdd
|
||||
try:
|
||||
# Sample is an ND-array.
|
||||
N, D = sample.shape
|
||||
except (AttributeError, ValueError):
|
||||
# Sample is a sequence of 1D arrays.
|
||||
sample = np.atleast_2d(sample).T
|
||||
N, D = sample.shape
|
||||
|
||||
nbin = np.empty(D, int)
|
||||
edges = D * [None]
|
||||
dedges = D * [None]
|
||||
|
||||
try:
|
||||
M = len(bins)
|
||||
if M != D:
|
||||
raise AttributeError('The dimension of bins must be equal '
|
||||
'to the dimension of the sample x.')
|
||||
except TypeError:
|
||||
bins = D * [bins]
|
||||
|
||||
# Select range for each dimension
|
||||
# Used only if number of bins is given.
|
||||
if range is None:
|
||||
smin = np.atleast_1d(np.array(sample.min(0), float))
|
||||
smax = np.atleast_1d(np.array(sample.max(0), float))
|
||||
else:
|
||||
smin = np.zeros(D)
|
||||
smax = np.zeros(D)
|
||||
for i in np.arange(D):
|
||||
smin[i], smax[i] = range[i]
|
||||
|
||||
# Make sure the bins have a finite width.
|
||||
for i in np.arange(len(smin)):
|
||||
if smin[i] == smax[i]:
|
||||
smin[i] = smin[i] - .5
|
||||
smax[i] = smax[i] + .5
|
||||
|
||||
# Create edge arrays
|
||||
for i in np.arange(D):
|
||||
if np.isscalar(bins[i]):
|
||||
nbin[i] = bins[i] + 2 # +2 for outlier bins
|
||||
edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1)
|
||||
else:
|
||||
edges[i] = np.asarray(bins[i], float)
|
||||
nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
|
||||
dedges[i] = np.diff(edges[i])
|
||||
|
||||
nbin = np.asarray(nbin)
|
||||
|
||||
# Compute the bin number each sample falls into.
|
||||
Ncount = {}
|
||||
for i in np.arange(D):
|
||||
Ncount[i] = np.digitize(sample[:, i], edges[i])
|
||||
|
||||
# Using digitize, values that fall on an edge are put in the right bin.
|
||||
# For the rightmost bin, we want values equal to the right
|
||||
# edge to be counted in the last bin, and not as an outlier.
|
||||
for i in np.arange(D):
|
||||
# Rounding precision
|
||||
decimal = int(-np.log10(dedges[i].min())) + 6
|
||||
# Find which points are on the rightmost edge.
|
||||
on_edge = np.where(np.around(sample[:, i], decimal)
|
||||
== np.around(edges[i][-1], decimal))[0]
|
||||
# Shift these points one bin to the left.
|
||||
Ncount[i][on_edge] -= 1
|
||||
|
||||
# Compute the sample indices in the flattened statistic matrix.
|
||||
ni = nbin.argsort()
|
||||
xy = np.zeros(N, int)
|
||||
for i in np.arange(0, D - 1):
|
||||
xy += Ncount[ni[i]] * nbin[ni[i + 1:]].prod()
|
||||
xy += Ncount[ni[-1]]
|
||||
|
||||
result = np.empty(nbin.prod(), float)
|
||||
|
||||
if statistic == 'mean':
|
||||
result.fill(np.nan)
|
||||
flatcount = np.bincount(xy, None)
|
||||
flatsum = np.bincount(xy, values)
|
||||
a = flatcount.nonzero()
|
||||
result[a] = flatsum[a] / flatcount[a]
|
||||
elif statistic == 'std':
|
||||
result.fill(0)
|
||||
flatcount = np.bincount(xy, None)
|
||||
flatsum = np.bincount(xy, values)
|
||||
flatsum2 = np.bincount(xy, values ** 2)
|
||||
a = flatcount.nonzero()
|
||||
result[a] = np.sqrt(flatsum2[a] / flatcount[a]
|
||||
- (flatsum[a] / flatcount[a]) ** 2)
|
||||
elif statistic == 'count':
|
||||
result.fill(0)
|
||||
flatcount = np.bincount(xy, None)
|
||||
a = np.arange(len(flatcount))
|
||||
result[a] = flatcount
|
||||
elif statistic == 'sum':
|
||||
result.fill(0)
|
||||
flatsum = np.bincount(xy, values)
|
||||
a = np.arange(len(flatsum))
|
||||
result[a] = flatsum
|
||||
elif statistic == 'median':
|
||||
result.fill(np.nan)
|
||||
for i in np.unique(xy):
|
||||
result[i] = np.median(values[xy == i])
|
||||
elif callable(statistic):
|
||||
with warnings.catch_warnings():
|
||||
# Numpy generates a warnings for mean/std/... with empty list
|
||||
warnings.filterwarnings('ignore', category=RuntimeWarning)
|
||||
old = np.seterr(invalid='ignore')
|
||||
try:
|
||||
null = statistic([])
|
||||
except:
|
||||
null = np.nan
|
||||
np.seterr(**old)
|
||||
result.fill(null)
|
||||
for i in np.unique(xy):
|
||||
result[i] = statistic(values[xy == i])
|
||||
|
||||
# Shape into a proper matrix
|
||||
result = result.reshape(np.sort(nbin))
|
||||
for i in np.arange(nbin.size):
|
||||
j = ni.argsort()[i]
|
||||
result = result.swapaxes(i, j)
|
||||
ni[i], ni[j] = ni[j], ni[i]
|
||||
|
||||
# Remove outliers (indices 0 and -1 for each dimension).
|
||||
core = D * [slice(1, -1)]
|
||||
result = result[core]
|
||||
|
||||
if (result.shape != nbin - 2).any():
|
||||
raise RuntimeError('Internal Shape Error')
|
||||
|
||||
return result, edges, xy
|
@ -0,0 +1,24 @@
|
||||
"""
|
||||
Statistics-related constants.
|
||||
|
||||
"""
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# The smallest representable positive number such that 1.0 + _EPS != 1.0.
|
||||
_EPS = np.finfo(float).eps
|
||||
|
||||
# The largest [in magnitude] usable floating value.
|
||||
_XMAX = np.finfo(float).machar.xmax
|
||||
|
||||
# The smallest [in magnitude] usable floating value.
|
||||
_XMIN = np.finfo(float).machar.xmin
|
||||
|
||||
# -special.psi(1)
|
||||
_EULER = 0.577215664901532860606512090082402431042
|
||||
|
||||
# special.zeta(3, 1) Apery's constant
|
||||
_ZETA3 = 1.202056903159594285399738161511449990765
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,15 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Tue Dec 06 16:02:47 2011
|
||||
|
||||
@author: pab
|
||||
"""
|
||||
import numpy as np
|
||||
import wafo.kdetools as wk
|
||||
n = 100
|
||||
x = np.sort(5*np.random.rand(1,n)-2.5, axis=-1).ravel()
|
||||
y = (np.cos(x)>2*np.random.rand(n, 1)-1).ravel()
|
||||
|
||||
kreg = wk.KRegression(x,y)
|
||||
f = kreg(output='plotobj', title='Kernel regression', plotflag=1)
|
||||
f.plot()
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue