You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

368 lines
12 KiB
Python

'''
Created on 15. des. 2016
@author: pab
'''
from __future__ import absolute_import, division, print_function
from scipy import sparse
import numpy as np
from itertools import product
__all__ = ['accum', 'gridcount']
def _assert(cond, msg):
if not cond:
raise ValueError(msg)
def bitget(int_type, offset):
"""Returns the value of the bit at the offset position in int_type.
Example
-------
>>> bitget(5, np.r_[0:4])
array([1, 0, 1, 0])
"""
return np.bitwise_and(int_type, 1 << offset) >> offset
def accumsum_sparse(accmap, a, shape=None, dtype=None):
"""
Example
-------
>>> from numpy import array
>>> a = array([[1,2,3],[4,-1,6],[-1,8,9]])
>>> a
array([[ 1, 2, 3],
[ 4, -1, 6],
[-1, 8, 9]])
>>> # Sum the diagonals.
>>> accmap = array([[0,1,2],[2,0,1],[1,2,0]])
>>> s = accumsum_sparse(accmap, a, (3,))
>>> np.allclose(s.toarray(), [ 9, 7, 15])
True
# group vals by idx and sum
>>> vals = array([12.0, 3.2, -15, 88, 12.9])
>>> idx = array([1, 0, 1, 4, 1 ])
>>> np.allclose(accumsum_sparse(idx, vals).toarray(),
... [3.2, 9.9, 0, 0, 88.])
True
"""
if dtype is None:
dtype = a.dtype
if shape is None:
shape = 1 + np.squeeze(np.apply_over_axes(np.max, accmap,
axes=tuple(range(a.ndim))))
shape = np.atleast_1d(shape)
if len(shape) > 1:
binx = accmap[:, 0]
biny = accmap[:, 1]
out = sparse.coo_matrix(
(a.ravel(), (binx, biny)), shape=shape, dtype=dtype).tocsr()
else:
binx = accmap.ravel()
zero = np.zeros(len(binx))
out = sparse.coo_matrix(
(a.ravel(), (binx, zero)), shape=(shape, 1), dtype=dtype).tocsr().T
return out
def accumsum(accmap, a, shape=None):
"""
Example
-------
>>> from numpy import array
>>> a = array([[1,2,3],[4,-1,6],[-1,8,9]])
>>> a
array([[ 1, 2, 3],
[ 4, -1, 6],
[-1, 8, 9]])
>>> accmap = array([[0,1,2],[2,0,1],[1,2,0]]) # Sum the diagonals.
>>> np.allclose(accumsum(accmap, a, (3,)), [ 9, 7, 15])
True
>>> accmap = array([[0,1,2],[0,1,2],[0,1,2]]) # Sum the columns.
>>> np.allclose(accumsum(accmap, a, (3,)), [4, 9, 18])
True
# group vals by idx and sum
>>> vals = array([12.0, 3.2, -15, 88, 12.9])
>>> idx = array([1, 0, 1, 4, 1 ])
>>> np.allclose(accumsum(idx, vals), [3.2, 9.9, 0, 0, 88.])
True
"""
if shape is None:
shape = 1 + np.squeeze(np.apply_over_axes(np.max, accmap,
axes=tuple(range(a.ndim))))
return np.bincount(accmap.ravel(), a.ravel(), np.array(shape).max())
def accum(accmap, a, func=None, shape=None, fill_value=0, dtype=None):
"""An accumulation function similar to Matlab's `accumarray` function.
Parameters
----------
accmap : ndarray
This is the "accumulation map". It maps input (i.e. indices into
`a`) to their destination in the output array. The first `a.ndim`
dimensions of `accmap` must be the same as `a.shape`. That is,
`accmap.shape[:a.ndim]` must equal `a.shape`. For example, if `a`
has shape (15,4), then `accmap.shape[:2]` must equal (15,4). In this
case `accmap[i,j]` gives the index into the output array where
element (i,j) of `a` is to be accumulated. If the output is, say,
a 2D, then `accmap` must have shape (15,4,2). The value in the
last dimension give indices into the output array. If the output is
1D, then the shape of `accmap` can be either (15,4) or (15,4,1)
a : ndarray
The input data to be accumulated.
func : callable or None
The accumulation function. The function will be passed a list
of values from `a` to be accumulated.
If None, numpy.sum is assumed.
shape : ndarray or None
The shape of the output array. If None, the shape will be determined
from `accmap`.
fill_value : scalar
The default value for elements of the output array.
dtype : numpy data type, or None
The data type of the output array. If None, the data type of
`a` is used.
Returns
-------
out : ndarray
The accumulated results.
The shape of `out` is `shape` if `shape` is given. Otherwise the
shape is determined by the (lexicographically) largest indices of
the output found in `accmap`.
Examples
--------
>>> from numpy import array, prod
>>> a = array([[1,2,3],[4,-1,6],[-1,8,9]])
>>> a
array([[ 1, 2, 3],
[ 4, -1, 6],
[-1, 8, 9]])
>>> accmap = array([[0,1,2],[2,0,1],[1,2,0]]) # Sum the diagonals.
>>> s = accum(accmap, a)
>>> s
array([ 9, 7, 15])
# A 2D output, from sub-arrays with shapes and positions like this:
# [ (2,2) (2,1)]
# [ (1,2) (1,1)]
>>> accmap = array([
... [[0,0],[0,0],[0,1]],
... [[0,0],[0,0],[0,1]],
... [[1,0],[1,0],[1,1]]])
# Accumulate using a product.
>>> accum(accmap, a, func=prod, dtype=float)
array([[ -8., 18.],
[ -8., 9.]])
>>> accum(accmap, a, dtype=float)
array([[ 6., 9.],
[ 7., 9.]])
# Same accmap, but create an array of lists of values.
>>> t = accum(accmap, a, func=lambda x: x, dtype='O')
>>> np.allclose(t[0][0], [1, 2, 4, -1])
True
>>> np.allclose(t[0][1], [3, 6])
True
>>> np.allclose(t[1][0], [-1, 8])
True
>>> np.allclose(t[1][1], [9])
True
"""
def create_array_of_python_lists(accmap, a, shape):
vals = np.empty(shape, dtype='O')
for s in product(*[list(range(k)) for k in shape]):
vals[s] = []
for s in product(*[list(range(k)) for k in a.shape]):
indx = tuple(accmap[s])
val = a[s]
vals[indx].append(val)
return vals
def _initialize(accmap, a, func, shape, dtype):
"""Check for bad arguments and handle the defaults."""
_assert(accmap.shape[:a.ndim] == a.shape,
"The initial dimensions of accmap must be the same as a.shape")
if accmap.shape == a.shape:
accmap = np.expand_dims(accmap, -1)
if func is None:
func = np.sum
if dtype is None:
dtype = a.dtype
adims = tuple(range(a.ndim))
if shape is None:
shape = 1 + np.squeeze(np.apply_over_axes(np.max, accmap,
axes=adims))
shape = np.atleast_1d(shape)
return accmap, shape, dtype, func
accmap, shape, dtype, func = _initialize(accmap, a, func, shape, dtype)
# Create an array of python lists of values.
vals = create_array_of_python_lists(accmap, a, shape)
# Create the output array.
out = np.empty(shape, dtype=dtype)
for s in product(*[list(range(k)) for k in shape]):
if vals[s] == []:
out[s] = fill_value
else:
out[s] = func(vals[s])
return out
def _gridcount_nd(acfun, data, x, y, w, binx):
d, inc = x.shape
c_shape = np.repeat(inc, d)
nc = c_shape.prod() # inc ** d
c = np.zeros((nc, ))
fact2 = np.asarray(np.reshape(inc * np.arange(d), (d, -1)), dtype=int)
fact1 = np.asarray(np.reshape(c_shape.cumprod() / inc, (d, -1)), dtype=int)
bt0 = [0, 0]
x1 = x.ravel()
for ir in range(2 ** (d - 1)):
bt0[0] = np.reshape(bitget(ir, np.arange(d)), (d, -1))
bt0[1] = 1 - bt0[0]
for ix in range(2):
one = np.mod(ix, 2)
two = np.mod(ix + 1, 2)
# Convert to linear index
# linear index to c
b1 = np.sum((binx + bt0[one]) * fact1, axis=0)
bt2 = bt0[two] + fact2
b2 = binx + bt2 # linear index to X
c += acfun(b1, np.abs(np.prod(x1[b2] - data, axis=0)) * y,
shape=(nc, ))
c = np.reshape(c / w, c_shape, order='F')
T = list(range(d))
T[1], T[0] = T[0], T[1]
# make sure c is stored in the same way as meshgrid
c = c.transpose(*T)
return c
def _gridcount_1d(acfun, data, x, y, w, binx, inc):
x.shape = -1,
c = (acfun(binx, (x[binx + 1] - data) * y, shape=(inc, )) +
acfun(binx + 1, (data - x[binx]) * y, shape=(inc, ))).ravel()
return c / w
def gridcount(data, X, y=1):
'''
Returns D-dimensional histogram using linear binning.
Parameters
----------
data : array-like
column vectors with D-dimensional data, shape D x Nd
X : array-like
row vectors defining discretization in each dimension, shape D x N.
The discretization must include the range of the data.
y : array-like
response data. Scalar or vector of size Nd.
Returns
-------
c : ndarray
gridcount, shape N x N x ... x N
GRIDCOUNT obtains the grid counts using linear binning.
There are 2 strategies: simple- or linear- binning.
Suppose that an observation occurs at x and that the nearest point
below and above is y and z, respectively. Then simple binning strategy
assigns a unit weight to either y or z, whichever is closer. Linear
binning, on the other hand, assigns the grid point at y with the weight
of (z-x)/(z-y) and the gridpoint at z a weight of (y-x)/(z-y).
In terms of approximation error of using gridcounts as pdf-estimate,
linear binning is significantly more accurate than simple binning.
NOTE: The interval [min(X);max(X)] must include the range of the data.
The order of C is permuted in the same order as
meshgrid for D==2 or D==3.
Example
-------
>>> import numpy as np
>>> import wafo.kdetools as wk
>>> N = 20
>>> data = np.random.rayleigh(1,N)
>>> data = np.array(
... [ 1.07855907, 1.51199717, 1.54382893, 1.54774808, 1.51913566,
... 1.11386486, 1.49146216, 1.51127214, 2.61287913, 0.94793051,
... 2.08532731, 1.35510641, 0.56759888, 1.55766981, 0.77883602,
... 0.9135759 , 0.81177855, 1.02111483, 1.76334202, 0.07571454])
>>> x = np.linspace(0,max(data)+1,50)
>>> dx = x[1]-x[0]
>>> c = wk.gridcount(data, x)
>>> np.allclose(c[:5], [ 0., 0.9731147, 0.0268853, 0., 0.])
True
>>> pdf = c/dx/N
>>> np.allclose(np.trapz(pdf, x), 1)
True
import matplotlib.pyplot as plt
h = plt.plot(x,c,'.') # 1D histogram
h1 = plt.plot(x, pdf) # 1D probability density plot
See also
--------
bincount, accum, kde
Reference
----------
Wand,M.P. and Jones, M.C. (1995)
'Kernel smoothing'
Chapman and Hall, pp 182-192
'''
dataset, x = np.atleast_2d(data, X)
y = np.atleast_1d(y).ravel()
d, inc = x.shape
_assert(d == dataset.shape[0], 'Dimension 0 of data and X do not match.')
xlo, xup = x[:, 0], x[:, -1]
datlo, datup = dataset.min(axis=1), dataset.max(axis=1)
_assert(not ((datlo < xlo) | (xup < datup)).any(),
'X does not include whole range of the data!')
# acfun = accumsum_sparse # faster than accum
acfun = accumsum # faster than accumsum_sparse
dx = np.diff(x[:, :2], axis=1)
binx = np.asarray(np.floor((dataset - xlo[:, np.newaxis]) / dx), dtype=int)
w = dx.prod()
if d == 1:
return _gridcount_1d(acfun, dataset, x, y, w, binx, inc)
return _gridcount_nd(acfun, dataset, x, y, w, binx)
if __name__ == '__main__':
from wafo.testing import test_docstrings
test_docstrings(__file__)