Made a baseclass _KDE for KDE and TKDE + updated tests

15 years ago · 171a0cd0c6
parent f8c543b94e
commit 171a0cd0c6
2 changed files with 353 additions and 262 deletions
--- a/pywafo/src/wafo/kdetools.py
+++ b/pywafo/src/wafo/kdetools.py
@ -57,9 +57,172 @@ def sphere_volume(d, r=1.0):
    'Kernel smoothing'
    Chapman and Hall, pp 105
    """
-    return (r ** d) * 2. * pi ** (d / 2.) / (d * gamma(d / 2.))
+    return (r ** d) * 2.0 * pi ** (d / 2.0) / (d * gamma(d / 2.0))
-class TKDE(object):
+class _KDE(object):
    """ Kernel-Density Estimator base class.
    Parameters
    ----------
    data : (# of dims, # of data)-array
        datapoints to estimate from
    hs : array-like (optional) 
        smoothing parameter vector/matrix.
        (default compute from data using kernel.get_smoothing function)
    kernel :  kernel function object.
        kernel must have get_smoothing method
    alpha : real scalar (optional)
        sensitivity parameter               (default 0 regular KDE)
        A good choice might be alpha = 0.5 ( or 1/D)
        alpha = 0      Regular  KDE (hs is constant)
        0 < alpha <= 1 Adaptive KDE (Make hs change)  
    Members
    -------
    d : int
        number of dimensions
    n : int
        number of datapoints
    Methods
    -------
    kde.eval_grid_fast(x0, x1,..., xd) : array
        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
    kde.eval_grid(x0, x1,..., xd) : array
        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
    kde.eval_points(points) : array
        evaluate the estimated pdf on a provided set of points
    kde(x0, x1,..., xd) : array
        same as kde.eval_grid(x0, x1,..., xd)
    """
    def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None,
                 xmax=None, inc=128):
        """Store the estimator settings, then run ``initialize``.

        Parameters
        ----------
        data : array-like
            Datapoints; kept as ``self.dataset`` after ``atleast_2d``.
        hs : optional
            Smoothing parameter, stored unchanged.
        kernel : optional
            Kernel object; any falsy value selects ``Kernel('gauss')``.
        alpha : optional
            Sensitivity parameter, stored unchanged.
        xmin, xmax : optional
            Grid limits; ``None`` lets ``_set_xlimits`` derive them.
        inc : int, optional
            Number of points per axis of the default evaluation grid.
        """
        self.dataset = atleast_2d(data)
        if not kernel:
            kernel = Kernel('gauss')
        self.kernel = kernel
        self.hs, self.alpha = hs, alpha
        self.xmin, self.xmax, self.inc = xmin, xmax, inc
        self.initialize()
    def initialize(self):
        """Derive ``d`` and ``n`` from the dataset, then finish the set-up.

        The dataset shape is read as (number of dimensions, number of
        datapoints). Afterwards the x-limits are set and the
        ``_initialize`` hook is called, in that order.
        """
        n_dims, n_points = self.dataset.shape
        self.d = n_dims
        self.n = n_points
        self._set_xlimits()
        self._initialize()
    def _initialize(self):
        """Hook run by ``initialize`` after the x-limits have been set.

        The base implementation does nothing.
        """
        pass
    def _set_xlimits(self):
        """Fill in ``self.xmin`` and ``self.xmax`` as length-``d`` vectors.

        A limit left as ``None`` is placed two robust standard deviations
        outside the data extreme, where the robust standard deviation is the
        smaller of the sample standard deviation (ddof=1) and IQR/1.34.
        A user-supplied limit is multiplied by ``np.ones(self.d)`` so that a
        scalar applies to every dimension.
        """
        data = self.dataset
        lowest = data.min(axis=-1)
        highest = data.max(axis=-1)
        robust_sigma = np.minimum(np.std(data, axis=-1, ddof=1),
                                  iqrange(data, axis=-1) / 1.34)
        margin = 2 * robust_sigma

        if self.xmin is not None:
            self.xmin = self.xmin * np.ones(self.d)
        else:
            self.xmin = lowest - margin

        if self.xmax is not None:
            self.xmax = self.xmax * np.ones(self.d)
        else:
            self.xmax = highest + margin
    def eval_grid_fast(self, *args):
        """Evaluate the estimated pdf on a grid via ``_eval_grid_fast``.

        Parameters
        ----------
        arg_0, arg_1, ..., arg_d-1 : vectors
            Grid vectors, one per dimension. When none are given,
            arg_i = linspace(self.xmin[i], self.xmax[i], self.inc) is used.

        Returns
        -------
        values
            Whatever ``_eval_grid_fast`` returns for these grid vectors.

        Notes
        -----
        The grid vectors actually used are remembered in ``self.args``.
        """
        if not args:
            args = [np.linspace(self.xmin[k], self.xmax[k], self.inc)
                    for k in range(self.d)]
        self.args = args
        return self._eval_grid_fast(*args)
    def _eval_grid_fast(self, *args):
        """Hook called by ``eval_grid_fast`` with the grid vectors.

        The base implementation does nothing and therefore returns None.
        """
        pass
    def eval_grid(self, *args):
        """Evaluate the estimated pdf on a grid via ``_eval_grid``.

        Parameters
        ----------
        arg_0, arg_1, ..., arg_d-1 : vectors
            Grid vectors, one per dimension. When none are given,
            arg_i = linspace(self.xmin[i], self.xmax[i], self.inc) is used.

        Returns
        -------
        values
            Whatever ``_eval_grid`` returns for these grid vectors.

        Notes
        -----
        The grid vectors actually used are remembered in ``self.args``.
        """
        if not args:
            args = [np.linspace(self.xmin[k], self.xmax[k], self.inc)
                    for k in range(self.d)]
        self.args = args
        return self._eval_grid(*args)
    def _eval_grid(self, *args):
        """Hook called by ``eval_grid`` with the grid vectors.

        The base implementation does nothing and therefore returns None.
        """
        pass
    def _check_shape(self, points):
        """Return ``points`` as a 2-D array with ``self.d`` rows.

        A single point given as a flat/row vector of length ``self.d`` is
        reshaped into a (d, 1) column.

        Raises
        ------
        ValueError
            If the number of rows differs from ``self.d`` and the input is
            not a single row of length ``self.d``.
        """
        points = atleast_2d(points)
        n_rows, n_cols = points.shape
        if n_rows == self.d:
            return points
        if n_rows == 1 and n_cols == self.d:
            # one point handed in as a row vector: turn it into a column
            return np.reshape(points, (self.d, 1))
        msg = "points have dimension %s, dataset has dimension %s" % (
            n_rows, self.d)
        raise ValueError(msg)
    def eval_points(self, points):
        """Evaluate the estimated pdf on a set of points.

        Parameters
        ----------
        points : (# of dimensions, # of points)-array
            A vector with (# of dimensions,) entries is also accepted and
            is treated as one single point.

        Returns
        -------
        values
            Whatever ``_eval_points`` returns for the shape-checked points.

        Raises
        ------
        ValueError
            If the dimensionality of ``points`` does not match that of the
            KDE (raised by ``_check_shape``).
        """
        return self._eval_points(self._check_shape(points))
    def _eval_points(self, points):
        """Hook called by ``eval_points`` with the shape-checked points.

        The base implementation does nothing and therefore returns None.
        """
        pass
    # Calling the estimator object directly is the same as calling eval_grid.
    __call__ = eval_grid
 class TKDE(_KDE):
    """ Transformation Kernel-Density Estimator.
    Parameters
@ -76,6 +239,17 @@ class TKDE(object):
        A good choice might be alpha = 0.5 ( or 1/D)
        alpha = 0      Regular  KDE (hs is constant)
        0 < alpha <= 1 Adaptive KDE (Make hs change)
    xmin, xmax  : vectors 
        specifying the default argument range for the kde.eval_grid methods. 
        For the kde.eval_grid_fast methods the values must cover the range of the data. 
        (default min(data)-range(data)/4, max(data)-range(data)/4)
        If a single value of xmin or xmax is given then the boundary is
        the same for all dimensions.
    inc :  scalar integer
        defining the default dimension of the output from kde.eval_grid methods (default 128)
        (For kde.eval_grid_fast: A value below 50 is very fast to compute but 
        may give some inaccuracies. Values between 100 and 500 give very 
        accurate results)  
    L2 : array-like 
        vector of transformation parameters (default 1 no transformation)
        t(xi;L2) = xi^L2*sign(L2)   for L2(i) ~= 0
@ -91,10 +265,14 @@ class TKDE(object):
    Methods
    -------
-    kde.evaluate(points) : array
+    kde.eval_grid_fast(x0, x1,..., xd) : array
        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
    kde.eval_grid(x0, x1,..., xd) : array
        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
    kde.eval_points(points) : array
        evaluate the estimated pdf on a provided set of points
-    kde(points) : array
+    kde(x0, x1,..., xd) : array
-        same as kde.evaluate(points)
+        same as kde.eval_grid(x0, x1,..., xd)
    Example
@ -119,8 +297,8 @@ class TKDE(object):
            0.20717946,  0.15907684,  0.1201074 ,  0.08941027,  0.06574882])
    >>> kde.eval_grid_fast(x)
-    array([ 0.        ,  0.4614821 ,  0.39554839,  0.32764086,  0.26275681,
+    array([ 1.06437223,  0.46203314,  0.39593137,  0.32781899,  0.26276433,
-            0.20543731,  0.15741056,  0.11863464,  0.        ,  0.        ])
+            0.20532206,  0.15723498,  0.11843998,  0.08797755,  0.        ])
    import pylab as plb          
    h1 = plb.plot(x, f) #  1D probability density plot
@ -129,19 +307,11 @@ class TKDE(object):
    def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None,
                 xmax=None, inc=128, L2=None):
        self.dataset = atleast_2d(data)
        self.hs = hs
        self.kernel = kernel if kernel else Kernel('gauss')
        self.alpha = alpha
        self.xmin = xmin
        self.xmax = xmax
        self.inc = inc
        self.L2 = L2
-        self.d, self.n = self.dataset.shape
+        _KDE.__init__(self, data, hs, kernel, alpha, xmin, xmax, inc)
        self.initialize()
-    def initialize(self):
+    def _initialize(self):
-        self._set_xlimits()
+        self._check_xmin()
        tdataset = self._dat2gaus(self.dataset)
        xmin = self.xmin
        if xmin is not None:
@ -151,38 +321,11 @@ class TKDE(object):
            xmax = self._dat2gaus(xmax)
        self.tkde = KDE(tdataset, self.hs, self.kernel, self.alpha, xmin, xmax,
                       self.inc)
-    def _set_xlimits(self):
+    def _check_xmin(self):
        amin = self.dataset.min(axis=-1)
        amax = self.dataset.max(axis=-1)
        xyzrange = amax-amin
        offset = xyzrange/4.0
        if self.xmin is None:
            self.xmin = amin - offset
        else:
            self.xmin = self.xmin * np.ones(self.d)
        if self.xmax is None:
            self.xmax = amax + offset
        else:
            self.xmax = self.xmax * np.ones(self.d)
        if self.L2 is not None:
            amin = self.dataset.min(axis= -1)
            L2 = np.atleast_1d(self.L2) * np.ones(self.d) # default no transformation
-            self.xmin = np.where(L2!=1, np.maximum(self.xmin, amin/2.0), self.xmin)
+            self.xmin = np.where(L2 != 1, np.maximum(self.xmin, amin / 100.0), self.xmin)
    def _check_shape(self, points):
        points = atleast_2d(points)
        d, m = points.shape
        if d != self.d:
            if d == 1 and m == self.d:
                # points was passed in as a row vector
                points = np.reshape(points, (self.d, 1))
                m = 1
            else:
                msg = "points have dimension %s, dataset has dimension %s" % (d,
                    self.d)
                raise ValueError(msg)
        return points   
    def _dat2gaus(self, points):
        if self.L2 is None:
@ -218,14 +361,15 @@ class TKDE(object):
                    transformation. Check the KDE for spurious spikes'''
            warnings.warn(msg)
        return pdf
-    def eval_grid_fast(self, *args):
+    
    def eval_grid_fast2(self, *args):
        """Evaluate the estimated pdf on a grid.
        Parameters
        ----------
        arg_0,arg_1,... arg_d-1 : vectors
           Alternatively, if no vectors is passed in then
-             arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)
+            arg_i = gauss2dat(linspace(dat2gauss(self.xmin[i]), dat2gauss(self.xmax[i]), self.inc))
        Returns
        -------
@ -233,7 +377,9 @@ class TKDE(object):
           The values evaluated at meshgrid(*args).
        """
        return self._eval_grid_fast(*args)
    def _eval_grid_fast(self, *args): 
        if self.L2 is None:
            f = self.tkde.eval_grid_fast(*args)
            self.args = self.tkde.args
@ -253,26 +399,7 @@ class TKDE(object):
            #fi.shape = ipoints[0].shape
            return fi
        return f
-    def eval_grid(self, *args):
+    def _eval_grid(self, *args):
        """Evaluate the estimated pdf on a grid.
        Parameters
        ----------
        arg_0,arg_1,... arg_d-1 : vectors
            Alternatively, if no vectors is passed in then
             arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)
        Returns
        -------
        values : array-like
            The values evaluated at meshgrid(*args).
        """
        if len(args)==0:
            args = []
            for i in range(self.d):
                args.append(np.linspace(self.xmin[i], self.xmax[i], self.inc))
        self.args = args
        if self.L2 is None:
            return self.tkde.eval_grid(*args)
        targs = self._dat2gaus(list(args))
@ -281,8 +408,7 @@ class TKDE(object):
        f = self._scale_pdf(tf, points)
        return f
-        return self.tkde.eval_grid(*args)
+    def _eval_points(self, points):
    def evaluate(self, points):
        """Evaluate the estimated pdf on a set of points.
        Parameters
@ -302,16 +428,14 @@ class TKDE(object):
        the dimensionality of the KDE.
        """
        if self.L2 is None:
-            return self.tkde(points)
+            return self.tkde.eval_points(points)
-        points = self._check_shape(points)
+        
        tpoints = self._dat2gaus(points)
-        tf = self.tkde(tpoints)
+        tf = self.tkde.eval_points(tpoints)
        f = self._scale_pdf(tf, points)
        return f
-    __call__ = evaluate
+class KDE(_KDE):
 class KDE(object):
    """ Kernel-Density Estimator.
    Parameters
@ -328,7 +452,17 @@ class KDE(object):
        A good choice might be alpha = 0.5 ( or 1/D)
        alpha = 0      Regular  KDE (hs is constant)
        0 < alpha <= 1 Adaptive KDE (Make hs change)  
-
+    xmin, xmax  : vectors 
        specifying the default argument range for the kde.eval_grid methods. 
        For the kde.eval_grid_fast methods the values must cover the range of the data. 
        (default min(data)-range(data)/4, max(data)-range(data)/4)
        If a single value of xmin or xmax is given then the boundary is
        the same for all dimensions.
    inc :  scalar integer
        defining the default dimension of the output from kde.eval_grid methods (default 128)
        (For kde.eval_grid_fast: A value below 50 is very fast to compute but 
        may give some inaccuracies. Values between 100 and 500 give very 
        accurate results)  
    Members
    -------
@ -339,10 +473,14 @@ class KDE(object):
    Methods
    -------
-    kde.evaluate(points) : array
+    kde.eval_grid_fast(x0, x1,..., xd) : array
        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
    kde.eval_grid(x0, x1,..., xd) : array
        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
    kde.eval_points(points) : array
        evaluate the estimated pdf on a provided set of points
-    kde(points) : array
+    kde(x0, x1,..., xd) : array
-        same as kde.evaluate(points)
+        same as kde.eval_grid(x0, x1,..., xd)
    Example
@ -367,7 +505,7 @@ class KDE(object):
            0.21409279,  0.12738463,  0.07460326,  0.03956191,  0.01887164])
    >>> kde0 = wk.KDE(data, hs=0.5, alpha=0.0)
-    >>> kde0.evaluate(x)
+    >>> kde0.eval_points(x)
    array([ 0.2039735 ,  0.40252503,  0.54595078,  0.52219649,  0.3906213 ,
            0.26381501,  0.16407362,  0.08270612,  0.02991145,  0.00720821])
@ -377,8 +515,8 @@ class KDE(object):
    >>> f = kde0.eval_grid_fast()
    >>> np.interp(x, kde0.args[0], f)
-    array([ 0.21165996,  0.41218257,  0.54961961,  0.51713209,  0.38292245,
+    array([ 0.21227584,  0.41256459,  0.5495661 ,  0.5176579 ,  0.38431616,
-            0.25864661,  0.16113184,  0.08055992,  0.03576856,  0.03576856])
+            0.2591162 ,  0.15978948,  0.07889179,  0.02769818,  0.00791829])
    import pylab as plb          
    h1 = plb.plot(x, f) #  1D probability density plot
@ -386,41 +524,18 @@ class KDE(object):
    """
    def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None, xmax=None, inc=128):
-        self.kernel = kernel if kernel else Kernel('gauss')
+        _KDE.__init__(self, data, hs, kernel, alpha, xmin, xmax, inc)
        self.hs = hs
        self.alpha = alpha
        self.dataset = atleast_2d(data)
        self.d, self.n = self.dataset.shape
        self.xmin = xmin
        self.xmax = xmax
        self.inc = inc
        self.initialize()
-    def initialize(self):
+    def _initialize(self):
        self._set_xlimits()
        self._compute_smoothing()
        if self.alpha > 0:
            pilot = KDE(self.dataset, hs=self.hs, kernel=self.kernel, alpha=0)
-            f = pilot(self.dataset) # get a pilot estimate by regular KDE (alpha=0)
+            f = pilot.eval_points(self.dataset) # get a pilot estimate by regular KDE (alpha=0)
            g = np.exp(np.mean(np.log(f)))
            self._lambda = (f / g) ** (-self.alpha)
        else:
            self._lambda = np.ones(self.n)
    def _set_xlimits(self):
        amin = self.dataset.min(axis=-1)
        amax = self.dataset.max(axis=-1)
        xyzrange = amax-amin
        if self.xmin is None:
            self.xmin = amin-xyzrange/4.0
        else:
            self.xmin = self.xmin * np.ones(self.d)
        if self.xmax is None:
            self.xmax = amax + xyzrange/4.0
        else:
            self.xmax = self.xmax * np.ones(self.d)
    def _compute_smoothing(self):
        """Computes the smoothing matrix
        """
@ -451,27 +566,7 @@ class KDE(object):
        self.hs = h
        self._norm_factor = deth * self.n
    def eval_grid_fast(self, *args):
        """Evaluate the estimated pdf on a grid.
        Parameters
        ----------
        arg_0,arg_1,... arg_d-1 : vectors
            Alternatively, if no vectors is passed in then
             arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)
        Returns
        -------
        values : array-like
            The values evaluated at meshgrid(*args).
        """
        if len(args)==0:
            args = []
            for i in range(self.d):
                args.append(np.linspace(self.xmin[i], self.xmax[i], self.inc))
        self.args = args
        return self._eval_grid_fast(*args)
    def _eval_grid_fast(self, *args):
        # TODO: This does not work correctly yet! Check it.
        X = np.vstack(args)
@ -509,29 +604,6 @@ class KDE(object):
        ix = (slice(0, inc),)*d
        return z[ix] * (z[ix] > 0.0)
    def eval_grid(self, *args):
        """Evaluate the estimated pdf on a grid.
        Parameters
        ----------
        arg_0,arg_1,... arg_d-1 : vectors
            Alternatively, if no vectors is passed in then
             arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)
        Returns
        -------
        values : array-like
            The values evaluated at meshgrid(*args).
        """
        if len(args)==0:
            args = []
            for i in range(self.d):
                args.append(np.linspace(self.xmin[i], self.xmax[i], self.inc))
        self.args = args
        return self._eval_grid(*args)
    def _eval_grid(self, *args):
        grd = meshgrid(*args) if len(args) > 1 else list(args)
@ -539,23 +611,11 @@ class KDE(object):
        d = len(grd)
        for i in range(d):
            grd[i] = grd[i].ravel()
-        f = self.evaluate(np.vstack(grd))
+        f = self.eval_points(np.vstack(grd))
        return f.reshape(shape0)
-    def _check_shape(self, points):
+
-        points = atleast_2d(points)
+    def _eval_points(self, points):
        d, m = points.shape
        if d != self.d:
            if d == 1 and m == self.d:
                # points was passed in as a row vector
                points = np.reshape(points, (self.d, 1))
                m = 1
            else:
                msg = "points have dimension %s, dataset has dimension %s" % (d,
                    self.d)
                raise ValueError(msg)
        return points   
    def evaluate(self, points):
        """Evaluate the estimated pdf on a set of points.
        Parameters
@ -574,8 +634,6 @@ class KDE(object):
        ValueError if the dimensionality of the input points is different than
        the dimensionality of the KDE.
        """
        points = self._check_shape(points)
        d, m = points.shape
        result = np.zeros((m,))
@ -598,8 +656,6 @@ class KDE(object):
        return result
    __call__ = evaluate
 class _Kernel(object):
    def __init__(self, r=1.0, stats=None):
@ -898,7 +954,7 @@ class Kernel(object):
        # R= int(mkernel(x)^2),  mu2= int(x^2*mkernel(x))
        mu2, R, Rdd = self.stats()
        AMISEconstant = (8 * sqrt(pi) * R / (3 * mu2 ** 2 * n)) ** (1. / 5)
-        iqr = np.abs(np.percentile(A, 75, axis=1) - np.percentile(A, 25, axis=1))# interquartile range
+        iqr = iqrange(A, axis=1) # interquartile range
        stdA = np.std(A, axis=1, ddof=1)
        #  % use of interquartile range guards against outliers.
        #  % the use of interquartile range is better if 
@ -1068,7 +1124,7 @@ class Kernel(object):
        ax1 = amin - arange / 8.0
        bx1 = amax + arange / 8.0
-        kernel2 = Kernel('gaus') 
+        kernel2 = Kernel('gauss') 
        mu2, R, Rdd = kernel2.stats()
        STEconstant2 = R / (mu2 ** (2) * n)
        fft = np.fft.fft
@ -1142,9 +1198,9 @@ class Kernel(object):
    def norm_factor(self, d=1, n=None):
        return  self.kernel.norm_factor(d, n)    
-    def evaluate(self, X):
+    def eval_points(self, points):
-        return self.kernel(np.atleast_2d(X))
+        return self.kernel(np.atleast_2d(points))
-    __call__ = evaluate
+    __call__ = eval_points
 def mkernel(X, kernel):
    '''
@ -1297,6 +1353,39 @@ def accum(accmap, a, func=None, size=None, fill_value=0, dtype=None):
    return out
 def iqrange(data, axis=None):
    '''
    Returns the Inter Quartile Range of data

    Parameters
    ----------
    data : array-like
        Input array or object that can be converted to an array.
    axis : {None, int}, optional
        Axis along which the percentiles are computed. The default (axis=None)
        is to compute the percentiles along a flattened version of the array.

    Returns
    -------
    r : array-like
        abs(np.percentile(data, 75, axis)-np.percentile(data, 25, axis))

    Notes
    -----
    IQRANGE is a robust measure of spread. The use of interquartile range
    guards against outliers if the distribution have heavy tails.

    Example
    -------
    >>> a = np.arange(101)
    >>> iqrange(a)
    50.0

    See also
    --------
    np.std
    '''
    # Ask for both quartiles in one call; the result is stacked along the
    # first axis, so unpacking yields the 25th and 75th percentiles.
    lower_q, upper_q = np.percentile(data, [25, 75], axis=axis)
    return np.abs(upper_q - lower_q)
 def bitget(int_type, offset):
    '''
    Returns the value of the bit at the offset position in int_type.
--- a/pywafo/src/wafo/test/test_kdetools.py
+++ b/pywafo/src/wafo/test/test_kdetools.py
@ -25,14 +25,16 @@ def test0_KDE1D():
    >>> kde0.eval_grid(x)
    array([ 0.2039735 ,  0.40252503,  0.54595078,  0.52219649,  0.3906213 ,
            0.26381501,  0.16407362,  0.08270612,  0.02991145,  0.00720821])
-    
+    >>> kde0.eval_grid_fast(x)
    array([ 0.32343789,  0.51366167,  0.55643329,  0.43688805,  0.28972471,
            0.19445277,  0.12473331,  0.06195215,  0.02087712,  0.00449567])
    >>> f = kde0.eval_grid_fast(); f
-    array([ 0.07264948,  0.14135253,  0.24141397,  0.36045498,  0.46962192,
+    array([ 0.02076721,  0.0612371 ,  0.14515308,  0.27604202,  0.42001793,
-            0.53604004,  0.5427015 ,  0.49767387,  0.42419428,  0.34349993,
+            0.51464781,  0.52131018,  0.45976136,  0.37621768,  0.29589521,
-            0.26650289,  0.19666903,  0.13569857,  0.0857818 ,  0.04868357,
+            0.21985316,  0.1473364 ,  0.08502256,  0.04063749,  0.0155788 ,
-            0.02432961])
+            0.00466938])
    >>> np.trapz(f,kde0.args)
-    array([ 0.97384215])
+    array([ 0.99416766])
    '''
 def test1_TKDE1D():
    '''