Made a baseclass _KDE for KDE and TKDE + updated tests

master
Per.Andreas.Brodtkorb 14 years ago
parent f8c543b94e
commit 171a0cd0c6

@ -57,9 +57,172 @@ def sphere_volume(d, r=1.0):
'Kernel smoothing'
Chapman and Hall, pp 105
"""
return (r ** d) * 2. * pi ** (d / 2.) / (d * gamma(d / 2.))
return (r ** d) * 2.0 * pi ** (d / 2.0) / (d * gamma(d / 2.0))
class TKDE(object):
class _KDE(object):
    """Kernel-Density Estimator base class.

    Holds the state and the argument handling shared by the concrete
    estimators.  Subclasses provide the numerical work by overriding the
    hooks ``_initialize``, ``_eval_grid_fast``, ``_eval_grid`` and
    ``_eval_points``.

    Parameters
    ----------
    data : (# of dims, # of data)-array
        datapoints to estimate from
    hs : array-like (optional)
        smoothing parameter vector/matrix.
        (default compute from data using kernel.get_smoothing function)
    kernel : kernel function object.
        kernel must have get_smoothing method
    alpha : real scalar (optional)
        sensitivity parameter               (default 0 regular KDE)
        A good choice might be alpha = 0.5 ( or 1/D)
        alpha = 0      Regular  KDE (hs is constant)
        0 < alpha <= 1 Adaptive KDE (Make hs change)
    xmin, xmax : vectors (optional)
        default argument range for the kde.eval_grid methods.
        (default min(data) - 2*sigma and max(data) + 2*sigma, where
        sigma = min(std(data), iqrange(data)/1.34) for each dimension)
        If a single value is given it is used for all dimensions.
    inc : scalar integer
        number of points per dimension in the default grid (default 128)

    Members
    -------
    d : int
        number of dimensions
    n : int
        number of datapoints

    Methods
    -------
    kde.eval_grid_fast(x0, x1,..., xd) : array
        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
    kde.eval_grid(x0, x1,..., xd) : array
        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
    kde.eval_points(points) : array
        evaluate the estimated pdf on a provided set of points
    kde(x0, x1,..., xd) : array
        same as kde.eval_grid(x0, x1,..., xd)
    """

    def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None,
                 xmax=None, inc=128):
        self.dataset = atleast_2d(data)
        self.hs = hs
        self.kernel = kernel if kernel else Kernel('gauss')
        self.alpha = alpha
        self.xmin = xmin
        self.xmax = xmax
        self.inc = inc
        self.initialize()

    def initialize(self):
        """Derive d and n from the dataset, set the limits, run the hook."""
        self.d, self.n = self.dataset.shape
        self._set_xlimits()
        self._initialize()

    def _initialize(self):
        """Hook for subclass specific setup; does nothing by default."""
        pass

    def _set_xlimits(self):
        """Fill in missing xmin/xmax and expand scalars to length-d vectors."""
        amin = self.dataset.min(axis=-1)
        amax = self.dataset.max(axis=-1)
        iqr = iqrange(self.dataset, axis=-1)
        # The smaller of the standard deviation and the scaled interquartile
        # range is used as the spread estimate for each dimension.
        sigma = np.minimum(np.std(self.dataset, axis=-1, ddof=1), iqr / 1.34)
        offset = 2 * sigma
        if self.xmin is None:
            self.xmin = amin - offset
        else:
            self.xmin = self.xmin * np.ones(self.d)
        if self.xmax is None:
            self.xmax = amax + offset
        else:
            self.xmax = self.xmax * np.ones(self.d)

    def _default_grid(self):
        """Return one linspace(xmin[i], xmax[i], inc) vector per dimension."""
        return [np.linspace(self.xmin[i], self.xmax[i], self.inc)
                for i in range(self.d)]

    def eval_grid_fast(self, *args):
        """Evaluate the estimated pdf on a grid.

        Parameters
        ----------
        arg_0,arg_1,... arg_d-1 : vectors
            Alternatively, if no vectors are passed in then
            arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)

        Returns
        -------
        values : array-like
            The values evaluated at meshgrid(*args).
        """
        if len(args) == 0:
            args = self._default_grid()
        # Remember the grid so callers can retrieve it after the call.
        self.args = args
        return self._eval_grid_fast(*args)

    def _eval_grid_fast(self, *args):
        """Hook doing the actual fast grid evaluation; must be overridden."""
        raise NotImplementedError

    def eval_grid(self, *args):
        """Evaluate the estimated pdf on a grid.

        Parameters
        ----------
        arg_0,arg_1,... arg_d-1 : vectors
            Alternatively, if no vectors are passed in then
            arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)

        Returns
        -------
        values : array-like
            The values evaluated at meshgrid(*args).
        """
        if len(args) == 0:
            args = self._default_grid()
        # Remember the grid so callers can retrieve it after the call.
        self.args = args
        return self._eval_grid(*args)

    def _eval_grid(self, *args):
        """Hook doing the actual grid evaluation; must be overridden."""
        raise NotImplementedError

    def _check_shape(self, points):
        """Return points as a (d, m) array, accepting a length-d vector.

        Raises
        ------
        ValueError if the first dimension of points cannot be matched to
        the dimensionality of the dataset.
        """
        points = atleast_2d(points)
        d, m = points.shape
        if d != self.d:
            if d == 1 and m == self.d:
                # points was passed in as a row vector
                points = np.reshape(points, (self.d, 1))
            else:
                msg = "points have dimension %s, dataset has dimension %s" % (
                    d, self.d)
                raise ValueError(msg)
        return points

    def eval_points(self, points):
        """Evaluate the estimated pdf on a set of points.

        Parameters
        ----------
        points : (# of dimensions, # of points)-array
            Alternatively, a (# of dimensions,) vector can be passed in and
            treated as a single point.

        Returns
        -------
        values : (# of points,)-array
            The values at each point.

        Raises
        ------
        ValueError if the dimensionality of the input points is different than
        the dimensionality of the KDE.
        """
        points = self._check_shape(points)
        return self._eval_points(points)

    def _eval_points(self, points):
        """Hook doing the actual point evaluation; must be overridden."""
        raise NotImplementedError

    __call__ = eval_grid
class TKDE(_KDE):
""" Transformation Kernel-Density Estimator.
Parameters
@ -76,6 +239,17 @@ class TKDE(object):
A good choice might be alpha = 0.5 ( or 1/D)
alpha = 0 Regular KDE (hs is constant)
0 < alpha <= 1 Adaptive KDE (Make hs change)
xmin, xmax : vectors
specifying the default argument range for the kde.eval_grid methods.
For the kde.eval_grid_fast methods the values must cover the range of the data.
(default min(data)-2*sigma, max(data)+2*sigma,
where sigma = min(std(data), iqrange(data)/1.34))
If a single value of xmin or xmax is given then the boundary is
the same for all dimensions.
inc : scalar integer
defining the default dimension of the output from kde.eval_grid methods (default 128)
(For kde.eval_grid_fast: A value below 50 is very fast to compute but
may give some inaccuracies. Values between 100 and 500 give very
accurate results)
L2 : array-like
vector of transformation parameters (default 1 no transformation)
t(xi;L2) = xi^L2*sign(L2) for L2(i) ~= 0
@ -91,10 +265,14 @@ class TKDE(object):
Methods
-------
kde.evaluate(points) : array
kde.eval_grid_fast(x0, x1,..., xd) : array
evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
kde.eval_grid(x0, x1,..., xd) : array
evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
kde.eval_points(points) : array
evaluate the estimated pdf on a provided set of points
kde(points) : array
same as kde.evaluate(points)
kde(x0, x1,..., xd) : array
same as kde.eval_grid(x0, x1,..., xd)
Example
@ -119,8 +297,8 @@ class TKDE(object):
0.20717946, 0.15907684, 0.1201074 , 0.08941027, 0.06574882])
>>> kde.eval_grid_fast(x)
array([ 0. , 0.4614821 , 0.39554839, 0.32764086, 0.26275681,
0.20543731, 0.15741056, 0.11863464, 0. , 0. ])
array([ 1.06437223, 0.46203314, 0.39593137, 0.32781899, 0.26276433,
0.20532206, 0.15723498, 0.11843998, 0.08797755, 0. ])
import pylab as plb
h1 = plb.plot(x, f) # 1D probability density plot
@ -129,19 +307,11 @@ class TKDE(object):
def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None,
xmax=None, inc=128, L2=None):
self.dataset = atleast_2d(data)
self.hs = hs
self.kernel = kernel if kernel else Kernel('gauss')
self.alpha = alpha
self.xmin = xmin
self.xmax = xmax
self.inc = inc
self.L2 = L2
self.d, self.n = self.dataset.shape
self.initialize()
_KDE.__init__(self, data, hs, kernel, alpha, xmin, xmax, inc)
def initialize(self):
self._set_xlimits()
def _initialize(self):
self._check_xmin()
tdataset = self._dat2gaus(self.dataset)
xmin = self.xmin
if xmin is not None:
@ -151,38 +321,11 @@ class TKDE(object):
xmax = self._dat2gaus(xmax)
self.tkde = KDE(tdataset, self.hs, self.kernel, self.alpha, xmin, xmax,
self.inc)
def _set_xlimits(self):
amin = self.dataset.min(axis=-1)
amax = self.dataset.max(axis=-1)
xyzrange = amax-amin
offset = xyzrange/4.0
if self.xmin is None:
self.xmin = amin - offset
else:
self.xmin = self.xmin * np.ones(self.d)
if self.xmax is None:
self.xmax = amax + offset
else:
self.xmax = self.xmax * np.ones(self.d)
def _check_xmin(self):
if self.L2 is not None:
amin = self.dataset.min(axis= -1)
L2 = np.atleast_1d(self.L2) * np.ones(self.d) # default no transformation
self.xmin = np.where(L2!=1, np.maximum(self.xmin, amin/2.0), self.xmin)
def _check_shape(self, points):
points = atleast_2d(points)
d, m = points.shape
if d != self.d:
if d == 1 and m == self.d:
# points was passed in as a row vector
points = np.reshape(points, (self.d, 1))
m = 1
else:
msg = "points have dimension %s, dataset has dimension %s" % (d,
self.d)
raise ValueError(msg)
return points
self.xmin = np.where(L2 != 1, np.maximum(self.xmin, amin / 100.0), self.xmin)
def _dat2gaus(self, points):
if self.L2 is None:
@ -218,14 +361,15 @@ class TKDE(object):
transformation. Check the KDE for spurious spikes'''
warnings.warn(msg)
return pdf
def eval_grid_fast(self, *args):
def eval_grid_fast2(self, *args):
"""Evaluate the estimated pdf on a grid.
Parameters
----------
arg_0,arg_1,... arg_d-1 : vectors
Alternatively, if no vectors is passed in then
arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)
arg_i = gauss2dat(linspace(dat2gauss(self.xmin[i]), dat2gauss(self.xmax[i]), self.inc))
Returns
-------
@ -233,7 +377,9 @@ class TKDE(object):
The values evaluated at meshgrid(*args).
"""
return self._eval_grid_fast(*args)
def _eval_grid_fast(self, *args):
if self.L2 is None:
f = self.tkde.eval_grid_fast(*args)
self.args = self.tkde.args
@ -253,26 +399,7 @@ class TKDE(object):
#fi.shape = ipoints[0].shape
return fi
return f
def eval_grid(self, *args):
"""Evaluate the estimated pdf on a grid.
Parameters
----------
arg_0,arg_1,... arg_d-1 : vectors
Alternatively, if no vectors is passed in then
arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)
Returns
-------
values : array-like
The values evaluated at meshgrid(*args).
"""
if len(args)==0:
args = []
for i in range(self.d):
args.append(np.linspace(self.xmin[i], self.xmax[i], self.inc))
self.args = args
def _eval_grid(self, *args):
if self.L2 is None:
return self.tkde.eval_grid(*args)
targs = self._dat2gaus(list(args))
@ -281,8 +408,7 @@ class TKDE(object):
f = self._scale_pdf(tf, points)
return f
return self.tkde.eval_grid(*args)
def evaluate(self, points):
def _eval_points(self, points):
"""Evaluate the estimated pdf on a set of points.
Parameters
@ -302,16 +428,14 @@ class TKDE(object):
the dimensionality of the KDE.
"""
if self.L2 is None:
return self.tkde(points)
points = self._check_shape(points)
return self.tkde.eval_points(points)
tpoints = self._dat2gaus(points)
tf = self.tkde(tpoints)
tf = self.tkde.eval_points(tpoints)
f = self._scale_pdf(tf, points)
return f
__call__ = evaluate
class KDE(object):
class KDE(_KDE):
""" Kernel-Density Estimator.
Parameters
@ -328,7 +452,17 @@ class KDE(object):
A good choice might be alpha = 0.5 ( or 1/D)
alpha = 0 Regular KDE (hs is constant)
0 < alpha <= 1 Adaptive KDE (Make hs change)
xmin, xmax : vectors
specifying the default argument range for the kde.eval_grid methods.
For the kde.eval_grid_fast methods the values must cover the range of the data.
(default min(data)-2*sigma, max(data)+2*sigma,
where sigma = min(std(data), iqrange(data)/1.34))
If a single value of xmin or xmax is given then the boundary is
the same for all dimensions.
inc : scalar integer
defining the default dimension of the output from kde.eval_grid methods (default 128)
(For kde.eval_grid_fast: A value below 50 is very fast to compute but
may give some inaccuracies. Values between 100 and 500 give very
accurate results)
Members
-------
@ -339,10 +473,14 @@ class KDE(object):
Methods
-------
kde.evaluate(points) : array
kde.eval_grid_fast(x0, x1,..., xd) : array
evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
kde.eval_grid(x0, x1,..., xd) : array
evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
kde.eval_points(points) : array
evaluate the estimated pdf on a provided set of points
kde(points) : array
same as kde.evaluate(points)
kde(x0, x1,..., xd) : array
same as kde.eval_grid(x0, x1,..., xd)
Example
@ -367,7 +505,7 @@ class KDE(object):
0.21409279, 0.12738463, 0.07460326, 0.03956191, 0.01887164])
>>> kde0 = wk.KDE(data, hs=0.5, alpha=0.0)
>>> kde0.evaluate(x)
>>> kde0.eval_points(x)
array([ 0.2039735 , 0.40252503, 0.54595078, 0.52219649, 0.3906213 ,
0.26381501, 0.16407362, 0.08270612, 0.02991145, 0.00720821])
@ -377,8 +515,8 @@ class KDE(object):
>>> f = kde0.eval_grid_fast()
>>> np.interp(x, kde0.args[0], f)
array([ 0.21165996, 0.41218257, 0.54961961, 0.51713209, 0.38292245,
0.25864661, 0.16113184, 0.08055992, 0.03576856, 0.03576856])
array([ 0.21227584, 0.41256459, 0.5495661 , 0.5176579 , 0.38431616,
0.2591162 , 0.15978948, 0.07889179, 0.02769818, 0.00791829])
import pylab as plb
h1 = plb.plot(x, f) # 1D probability density plot
@ -386,41 +524,18 @@ class KDE(object):
"""
def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None, xmax=None, inc=128):
self.kernel = kernel if kernel else Kernel('gauss')
self.hs = hs
self.alpha = alpha
self.dataset = atleast_2d(data)
self.d, self.n = self.dataset.shape
self.xmin = xmin
self.xmax = xmax
self.inc = inc
self.initialize()
_KDE.__init__(self, data, hs, kernel, alpha, xmin, xmax, inc)
def initialize(self):
self._set_xlimits()
def _initialize(self):
self._compute_smoothing()
if self.alpha > 0:
pilot = KDE(self.dataset, hs=self.hs, kernel=self.kernel, alpha=0)
f = pilot(self.dataset) # get a pilot estimate by regular KDE (alpha=0)
f = pilot.eval_points(self.dataset) # get a pilot estimate by regular KDE (alpha=0)
g = np.exp(np.mean(np.log(f)))
self._lambda = (f / g) ** (-self.alpha)
else:
self._lambda = np.ones(self.n)
def _set_xlimits(self):
amin = self.dataset.min(axis=-1)
amax = self.dataset.max(axis=-1)
xyzrange = amax-amin
if self.xmin is None:
self.xmin = amin-xyzrange/4.0
else:
self.xmin = self.xmin * np.ones(self.d)
if self.xmax is None:
self.xmax = amax + xyzrange/4.0
else:
self.xmax = self.xmax * np.ones(self.d)
def _compute_smoothing(self):
"""Computes the smoothing matrix
"""
@ -451,27 +566,7 @@ class KDE(object):
self.hs = h
self._norm_factor = deth * self.n
def eval_grid_fast(self, *args):
"""Evaluate the estimated pdf on a grid.
Parameters
----------
arg_0,arg_1,... arg_d-1 : vectors
Alternatively, if no vectors is passed in then
arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)
Returns
-------
values : array-like
The values evaluated at meshgrid(*args).
"""
if len(args)==0:
args = []
for i in range(self.d):
args.append(np.linspace(self.xmin[i], self.xmax[i], self.inc))
self.args = args
return self._eval_grid_fast(*args)
def _eval_grid_fast(self, *args):
# TODO: This does not work correctly yet! Check it.
X = np.vstack(args)
@ -509,29 +604,6 @@ class KDE(object):
ix = (slice(0, inc),)*d
return z[ix] * (z[ix] > 0.0)
def eval_grid(self, *args):
"""Evaluate the estimated pdf on a grid.
Parameters
----------
arg_0,arg_1,... arg_d-1 : vectors
Alternatively, if no vectors is passed in then
arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)
Returns
-------
values : array-like
The values evaluated at meshgrid(*args).
"""
if len(args)==0:
args = []
for i in range(self.d):
args.append(np.linspace(self.xmin[i], self.xmax[i], self.inc))
self.args = args
return self._eval_grid(*args)
def _eval_grid(self, *args):
grd = meshgrid(*args) if len(args) > 1 else list(args)
@ -539,23 +611,11 @@ class KDE(object):
d = len(grd)
for i in range(d):
grd[i] = grd[i].ravel()
f = self.evaluate(np.vstack(grd))
f = self.eval_points(np.vstack(grd))
return f.reshape(shape0)
def _check_shape(self, points):
points = atleast_2d(points)
d, m = points.shape
if d != self.d:
if d == 1 and m == self.d:
# points was passed in as a row vector
points = np.reshape(points, (self.d, 1))
m = 1
else:
msg = "points have dimension %s, dataset has dimension %s" % (d,
self.d)
raise ValueError(msg)
return points
def evaluate(self, points):
def _eval_points(self, points):
"""Evaluate the estimated pdf on a set of points.
Parameters
@ -574,8 +634,6 @@ class KDE(object):
ValueError if the dimensionality of the input points is different than
the dimensionality of the KDE.
"""
points = self._check_shape(points)
d, m = points.shape
result = np.zeros((m,))
@ -598,8 +656,6 @@ class KDE(object):
return result
__call__ = evaluate
class _Kernel(object):
def __init__(self, r=1.0, stats=None):
@ -898,7 +954,7 @@ class Kernel(object):
# R= int(mkernel(x)^2), mu2= int(x^2*mkernel(x))
mu2, R, Rdd = self.stats()
AMISEconstant = (8 * sqrt(pi) * R / (3 * mu2 ** 2 * n)) ** (1. / 5)
iqr = np.abs(np.percentile(A, 75, axis=1) - np.percentile(A, 25, axis=1))# interquartile range
iqr = iqrange(A, axis=1) # interquartile range
stdA = np.std(A, axis=1, ddof=1)
# % use of interquartile range guards against outliers.
# % the use of interquartile range is better if
@ -1068,7 +1124,7 @@ class Kernel(object):
ax1 = amin - arange / 8.0
bx1 = amax + arange / 8.0
kernel2 = Kernel('gaus')
kernel2 = Kernel('gauss')
mu2, R, Rdd = kernel2.stats()
STEconstant2 = R / (mu2 ** (2) * n)
fft = np.fft.fft
@ -1142,9 +1198,9 @@ class Kernel(object):
def norm_factor(self, d=1, n=None):
return self.kernel.norm_factor(d, n)
def evaluate(self, X):
return self.kernel(np.atleast_2d(X))
__call__ = evaluate
def eval_points(self, points):
return self.kernel(np.atleast_2d(points))
__call__ = eval_points
def mkernel(X, kernel):
'''
@ -1297,6 +1353,39 @@ def accum(accmap, a, func=None, size=None, fill_value=0, dtype=None):
return out
def iqrange(data, axis=None):
    '''
    Returns the Inter Quartile Range of data

    Parameters
    ----------
    data : array-like
        Input array or object that can be converted to an array.
    axis : {None, int}, optional
        Axis along which the percentiles are computed. The default (axis=None)
        is to compute the percentiles along a flattened version of the array.

    Returns
    -------
    r : array-like
        abs(np.percentile(data, 75, axis)-np.percentile(data, 25, axis))

    Notes
    -----
    IQRANGE is a robust measure of spread. The use of interquartile range
    guards against outliers if the distribution have heavy tails.

    Example
    -------
    >>> a = np.arange(101)
    >>> iqrange(a)
    50.0

    See also
    --------
    np.std
    '''
    # Ask for both quartiles in one call so the data is only processed once;
    # the requested percentiles come back along the first axis of the result.
    q25, q75 = np.percentile(data, [25, 75], axis=axis)
    return np.abs(q75 - q25)
def bitget(int_type, offset):
'''
Returns the value of the bit at the offset position in int_type.

@ -25,14 +25,16 @@ def test0_KDE1D():
>>> kde0.eval_grid(x)
array([ 0.2039735 , 0.40252503, 0.54595078, 0.52219649, 0.3906213 ,
0.26381501, 0.16407362, 0.08270612, 0.02991145, 0.00720821])
>>> kde0.eval_grid_fast(x)
array([ 0.32343789, 0.51366167, 0.55643329, 0.43688805, 0.28972471,
0.19445277, 0.12473331, 0.06195215, 0.02087712, 0.00449567])
>>> f = kde0.eval_grid_fast(); f
array([ 0.07264948, 0.14135253, 0.24141397, 0.36045498, 0.46962192,
0.53604004, 0.5427015 , 0.49767387, 0.42419428, 0.34349993,
0.26650289, 0.19666903, 0.13569857, 0.0857818 , 0.04868357,
0.02432961])
array([ 0.02076721, 0.0612371 , 0.14515308, 0.27604202, 0.42001793,
0.51464781, 0.52131018, 0.45976136, 0.37621768, 0.29589521,
0.21985316, 0.1473364 , 0.08502256, 0.04063749, 0.0155788 ,
0.00466938])
>>> np.trapz(f,kde0.args)
array([ 0.97384215])
array([ 0.99416766])
'''
def test1_TKDE1D():
'''

Loading…
Cancel
Save