Made a baseclass _KDE for KDE and TKDE + updated tests

15 years ago · 171a0cd0c6
parent f8c543b94e
commit 171a0cd0c6
2 changed files with 353 additions and 262 deletions
--- a/pywafo/src/wafo/kdetools.py
+++ b/pywafo/src/wafo/kdetools.py
@ -57,9 +57,172 @@ def sphere_volume(d, r=1.0):
    'Kernel smoothing'
    Chapman and Hall, pp 105
    """
-    return (r ** d) * 2. * pi ** (d / 2.) / (d * gamma(d / 2.))
+    return (r ** d) * 2.0 * pi ** (d / 2.0) / (d * gamma(d / 2.0))

-class TKDE(object):
+class _KDE(object):
+    """ Kernel-Density Estimator base class.
+
+    Parameters
+    ----------
+    data : (# of dims, # of data)-array
+        datapoints to estimate from
+    hs : array-like (optional) 
+        smoothing parameter vector/matrix.
+        (default compute from data using kernel.get_smoothing function)
+    kernel :  kernel function object.
+        kernel must have get_smoothing method
+    alpha : real scalar (optional)
+        sensitivity parameter               (default 0 regular KDE)
+        A good choice might be alpha = 0.5 ( or 1/D)
+        alpha = 0      Regular  KDE (hs is constant)
+        0 < alpha <= 1 Adaptive KDE (Make hs change)  
+
+
+    Members
+    -------
+    d : int
+        number of dimensions
+    n : int
+        number of datapoints
+
+    Methods
+    -------
+    kde.eval_grid_fast(x0, x1,..., xd) : array
+        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
+    kde.eval_grid(x0, x1,..., xd) : array
+        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
+    kde.eval_points(points) : array
+        evaluate the estimated pdf on a provided set of points
+    kde(x0, x1,..., xd) : array
+        same as kde.eval_grid(x0, x1,..., xd)
+    """
+
+    def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None, xmax=None, inc=128):
+        self.dataset = atleast_2d(data)
+        self.hs = hs
+        self.kernel = kernel if kernel else Kernel('gauss')
+        self.alpha = alpha
+        self.xmin = xmin
+        self.xmax = xmax
+        self.inc = inc
+        self.initialize()
+
+    def initialize(self):
+        self.d, self.n = self.dataset.shape
+        self._set_xlimits()
+        self._initialize()
+        
+    def _initialize(self):
+        pass
+    
+    def _set_xlimits(self):
+        amin = self.dataset.min(axis= -1)
+        amax = self.dataset.max(axis= -1)
+        iqr = iqrange(self.dataset, axis=-1)
+        sigma = np.minimum(np.std(self.dataset, axis=-1, ddof=1),iqr/1.34)
+        #xyzrange = amax - amin
+        #offset = xyzrange / 4.0
+        offset  = 2*sigma
+        if self.xmin is None:
+            self.xmin = amin - offset
+        else:
+            self.xmin = self.xmin * np.ones(self.d)
+        if self.xmax is None:
+            self.xmax = amax + offset
+        else:
+            self.xmax = self.xmax * np.ones(self.d)
+            
+    def eval_grid_fast(self, *args):
+        """Evaluate the estimated pdf on a grid.
+
+        Parameters
+        ----------
+        arg_0,arg_1,... arg_d-1 : vectors
+            Alternatively, if no vectors are passed in then
+             arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)
+
+        Returns
+        -------
+        values : array-like
+            The values evaluated at meshgrid(*args).
+
+        """
+        if len(args) == 0:
+            args = []
+            for i in range(self.d):
+                args.append(np.linspace(self.xmin[i], self.xmax[i], self.inc))
+        self.args = args
+        return self._eval_grid_fast(*args)
+    def _eval_grid_fast(self, *args):
+        pass
+
+    def eval_grid(self, *args):
+        """Evaluate the estimated pdf on a grid.
+
+        Parameters
+        ----------
+        arg_0,arg_1,... arg_d-1 : vectors
+            Alternatively, if no vectors are passed in then
+             arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)
+
+        Returns
+        -------
+        values : array-like
+            The values evaluated at meshgrid(*args).
+
+        """
+
+        if len(args) == 0:
+            args = []
+            for i in range(self.d):
+                args.append(np.linspace(self.xmin[i], self.xmax[i], self.inc))
+        self.args = args
+        return self._eval_grid(*args)
+    
+    def _eval_grid(self, *args):
+        pass
+    
+    def _check_shape(self, points):
+        points = atleast_2d(points)
+        d, m = points.shape
+        if d != self.d:
+            if d == 1 and m == self.d:
+                # points was passed in as a row vector
+                points = np.reshape(points, (self.d, 1))
+            else:
+                msg = "points have dimension %s, dataset has dimension %s" % (d,
+                    self.d)
+                raise ValueError(msg)
+        return points   
+    def eval_points(self, points):
+        """Evaluate the estimated pdf on a set of points.
+
+        Parameters
+        ----------
+        points : (# of dimensions, # of points)-array
+            Alternatively, a (# of dimensions,) vector can be passed in and
+            treated as a single point.
+
+        Returns
+        -------
+        values : (# of points,)-array
+            The values at each point.
+
+        Raises
+        ------
+        ValueError if the dimensionality of the input points is different than
+        the dimensionality of the KDE.
+        """
+
+        points = self._check_shape(points)
+        return self._eval_points(points)
+    
+    def _eval_points(self, points):
+        pass
+
+    __call__ = eval_grid
+    
+class TKDE(_KDE):
    """ Transformation Kernel-Density Estimator.

    Parameters
@ -76,6 +239,17 @@ class TKDE(object):
        A good choice might be alpha = 0.5 ( or 1/D)
        alpha = 0      Regular  KDE (hs is constant)
        0 < alpha <= 1 Adaptive KDE (Make hs change)
+    xmin, xmax  : vectors 
+        specifying the default argument range for the kde.eval_grid methods. 
+        For the kde.eval_grid_fast methods the values must cover the range of the data. 
+        (default min(data)-2*sigma, max(data)+2*sigma with sigma=min(std(data), iqr(data)/1.34))
+        If a single value of xmin or xmax is given then the boundary is
+        the same for all dimensions.
+    inc :  scalar integer
+        defining the default dimension of the output from kde.eval_grid methods (default 128)
+        (For kde.eval_grid_fast: A value below 50 is very fast to compute but 
+        may give some inaccuracies. Values between 100 and 500 give very 
+        accurate results)  
    L2 : array-like 
        vector of transformation parameters (default 1 no transformation)
        t(xi;L2) = xi^L2*sign(L2)   for L2(i) ~= 0
@ -91,10 +265,14 @@ class TKDE(object):

    Methods
    -------
-    kde.evaluate(points) : array
+    kde.eval_grid_fast(x0, x1,..., xd) : array
+        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
+    kde.eval_grid(x0, x1,..., xd) : array
+        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
+    kde.eval_points(points) : array
        evaluate the estimated pdf on a provided set of points
-    kde(points) : array
-        same as kde.evaluate(points)
+    kde(x0, x1,..., xd) : array
+        same as kde.eval_grid(x0, x1,..., xd)
   
    
    Example
@ -119,8 +297,8 @@ class TKDE(object):
            0.20717946,  0.15907684,  0.1201074 ,  0.08941027,  0.06574882])
            
    >>> kde.eval_grid_fast(x)
-    array([ 0.        ,  0.4614821 ,  0.39554839,  0.32764086,  0.26275681,
-            0.20543731,  0.15741056,  0.11863464,  0.        ,  0.        ])
+    array([ 1.06437223,  0.46203314,  0.39593137,  0.32781899,  0.26276433,
+            0.20532206,  0.15723498,  0.11843998,  0.08797755,  0.        ])
            
    import pylab as plb          
    h1 = plb.plot(x, f) #  1D probability density plot
@ -129,19 +307,11 @@ class TKDE(object):

    def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None,
                 xmax=None, inc=128, L2=None):
-        self.dataset = atleast_2d(data)
-        self.hs = hs
-        self.kernel = kernel if kernel else Kernel('gauss')
-        self.alpha = alpha
-        self.xmin = xmin
-        self.xmax = xmax
-        self.inc = inc
        self.L2 = L2
-        self.d, self.n = self.dataset.shape
-        self.initialize()
+        _KDE.__init__(self, data, hs, kernel, alpha, xmin, xmax, inc)
    
-    def initialize(self):
-        self._set_xlimits()
+    def _initialize(self):
+        self._check_xmin()
        tdataset = self._dat2gaus(self.dataset)
        xmin = self.xmin
        if xmin is not None:
@ -151,38 +321,11 @@ class TKDE(object):
            xmax = self._dat2gaus(xmax)
        self.tkde = KDE(tdataset, self.hs, self.kernel, self.alpha, xmin, xmax,
                       self.inc)
-    def _set_xlimits(self):
-        amin = self.dataset.min(axis=-1)
-        amax = self.dataset.max(axis=-1)
-        xyzrange = amax-amin
-        offset = xyzrange/4.0
-        if self.xmin is None:
-            self.xmin = amin - offset
-        else:
-            self.xmin = self.xmin * np.ones(self.d)
-        if self.xmax is None:
-            self.xmax = amax + offset
-        else:
-            self.xmax = self.xmax * np.ones(self.d)
-        
+    def _check_xmin(self):
        if self.L2 is not None:
+            amin = self.dataset.min(axis= -1)
            L2 = np.atleast_1d(self.L2) * np.ones(self.d) # default no transformation
-            self.xmin = np.where(L2!=1, np.maximum(self.xmin, amin/2.0), self.xmin)
-            
-        
-    def _check_shape(self, points):
-        points = atleast_2d(points)
-        d, m = points.shape
-        if d != self.d:
-            if d == 1 and m == self.d:
-                # points was passed in as a row vector
-                points = np.reshape(points, (self.d, 1))
-                m = 1
-            else:
-                msg = "points have dimension %s, dataset has dimension %s" % (d,
-                    self.d)
-                raise ValueError(msg)
-        return points   
+            self.xmin = np.where(L2 != 1, np.maximum(self.xmin, amin / 100.0), self.xmin)
            
    def _dat2gaus(self, points):
        if self.L2 is None:
@ -218,14 +361,15 @@ class TKDE(object):
                    transformation. Check the KDE for spurious spikes'''
            warnings.warn(msg)
        return pdf
-    def eval_grid_fast(self, *args):
+    
+    def eval_grid_fast2(self, *args):
        """Evaluate the estimated pdf on a grid.
        
        Parameters
        ----------
        arg_0,arg_1,... arg_d-1 : vectors
           Alternatively, if no vectors is passed in then
-             arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)
+            arg_i = gauss2dat(linspace(dat2gauss(self.xmin[i]), dat2gauss(self.xmax[i]), self.inc))
        
        Returns
        -------
@ -233,7 +377,9 @@ class TKDE(object):
           The values evaluated at meshgrid(*args).
        
        """
+        return self._eval_grid_fast(*args)
    
+    def _eval_grid_fast(self, *args): 
        if self.L2 is None:
            f = self.tkde.eval_grid_fast(*args)
            self.args = self.tkde.args
@ -253,26 +399,7 @@ class TKDE(object):
            #fi.shape = ipoints[0].shape
            return fi
        return f
-    def eval_grid(self, *args):
-        """Evaluate the estimated pdf on a grid.
-
-        Parameters
-        ----------
-        arg_0,arg_1,... arg_d-1 : vectors
-            Alternatively, if no vectors is passed in then
-             arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)
-
-        Returns
-        -------
-        values : array-like
-            The values evaluated at meshgrid(*args).
-
-        """
-        if len(args)==0:
-            args = []
-            for i in range(self.d):
-                args.append(np.linspace(self.xmin[i], self.xmax[i], self.inc))
-        self.args = args
+    def _eval_grid(self, *args):
        if self.L2 is None:
            return self.tkde.eval_grid(*args)
        targs = self._dat2gaus(list(args))
@ -281,8 +408,7 @@ class TKDE(object):
        f = self._scale_pdf(tf, points)
        return f

-        return self.tkde.eval_grid(*args)
-    def evaluate(self, points):
+    def _eval_points(self, points):
        """Evaluate the estimated pdf on a set of points.

        Parameters
@ -302,16 +428,14 @@ class TKDE(object):
        the dimensionality of the KDE.
        """
        if self.L2 is None:
-            return self.tkde(points)
-        points = self._check_shape(points)
+            return self.tkde.eval_points(points)
+        
        tpoints = self._dat2gaus(points)
-        tf = self.tkde(tpoints)
+        tf = self.tkde.eval_points(tpoints)
        f = self._scale_pdf(tf, points)
        return f
    
-    __call__ = evaluate
-    
-class KDE(object):
+class KDE(_KDE):
    """ Kernel-Density Estimator.

    Parameters
@ -328,7 +452,17 @@ class KDE(object):
        A good choice might be alpha = 0.5 ( or 1/D)
        alpha = 0      Regular  KDE (hs is constant)
        0 < alpha <= 1 Adaptive KDE (Make hs change)  
-
+    xmin, xmax  : vectors 
+        specifying the default argument range for the kde.eval_grid methods. 
+        For the kde.eval_grid_fast methods the values must cover the range of the data. 
+        (default min(data)-2*sigma, max(data)+2*sigma with sigma=min(std(data), iqr(data)/1.34))
+        If a single value of xmin or xmax is given then the boundary is
+        the same for all dimensions.
+    inc :  scalar integer
+        defining the default dimension of the output from kde.eval_grid methods (default 128)
+        (For kde.eval_grid_fast: A value below 50 is very fast to compute but 
+        may give some inaccuracies. Values between 100 and 500 give very 
+        accurate results)  

    Members
    -------
@ -339,10 +473,14 @@ class KDE(object):

    Methods
    -------
-    kde.evaluate(points) : array
+    kde.eval_grid_fast(x0, x1,..., xd) : array
+        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
+    kde.eval_grid(x0, x1,..., xd) : array
+        evaluate the estimated pdf on meshgrid(x0, x1,..., xd)
+    kde.eval_points(points) : array
        evaluate the estimated pdf on a provided set of points
-    kde(points) : array
-        same as kde.evaluate(points)
+    kde(x0, x1,..., xd) : array
+        same as kde.eval_grid(x0, x1,..., xd)
   
    
    Example
@ -367,7 +505,7 @@ class KDE(object):
            0.21409279,  0.12738463,  0.07460326,  0.03956191,  0.01887164])
    
    >>> kde0 = wk.KDE(data, hs=0.5, alpha=0.0)
-    >>> kde0.evaluate(x)
+    >>> kde0.eval_points(x)
    array([ 0.2039735 ,  0.40252503,  0.54595078,  0.52219649,  0.3906213 ,
            0.26381501,  0.16407362,  0.08270612,  0.02991145,  0.00720821])
    
@ -377,8 +515,8 @@ class KDE(object):
    
    >>> f = kde0.eval_grid_fast()
    >>> np.interp(x, kde0.args[0], f)
-    array([ 0.21165996,  0.41218257,  0.54961961,  0.51713209,  0.38292245,
-            0.25864661,  0.16113184,  0.08055992,  0.03576856,  0.03576856])
+    array([ 0.21227584,  0.41256459,  0.5495661 ,  0.5176579 ,  0.38431616,
+            0.2591162 ,  0.15978948,  0.07889179,  0.02769818,  0.00791829])
            
    import pylab as plb          
    h1 = plb.plot(x, f) #  1D probability density plot
@ -386,41 +524,18 @@ class KDE(object):
    """

    def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None, xmax=None, inc=128):
-        self.kernel = kernel if kernel else Kernel('gauss')
-        self.hs = hs
-        self.alpha = alpha
-        
-        self.dataset = atleast_2d(data)
-        self.d, self.n = self.dataset.shape
-        self.xmin = xmin
-        self.xmax = xmax
-        self.inc = inc
-        self.initialize()
+        _KDE.__init__(self, data, hs, kernel, alpha, xmin, xmax, inc)
            
-    def initialize(self):
-        self._set_xlimits()
+    def _initialize(self):
        self._compute_smoothing()
        if self.alpha > 0:
            pilot = KDE(self.dataset, hs=self.hs, kernel=self.kernel, alpha=0)
-            f = pilot(self.dataset) # get a pilot estimate by regular KDE (alpha=0)
+            f = pilot.eval_points(self.dataset) # get a pilot estimate by regular KDE (alpha=0)
            g = np.exp(np.mean(np.log(f)))
            self._lambda = (f / g) ** (-self.alpha)
        else:
            self._lambda = np.ones(self.n)
                
-    def _set_xlimits(self):
-        amin = self.dataset.min(axis=-1)
-        amax = self.dataset.max(axis=-1)
-        xyzrange = amax-amin
-        if self.xmin is None:
-            self.xmin = amin-xyzrange/4.0
-        else:
-            self.xmin = self.xmin * np.ones(self.d)
-        if self.xmax is None:
-            self.xmax = amax + xyzrange/4.0
-        else:
-            self.xmax = self.xmax * np.ones(self.d)
-            
    def _compute_smoothing(self):
        """Computes the smoothing matrix
        """
@ -451,27 +566,7 @@ class KDE(object):
        self.hs = h
        self._norm_factor = deth * self.n
    
-    def eval_grid_fast(self, *args):
-        """Evaluate the estimated pdf on a grid.
-
-        Parameters
-        ----------
-        arg_0,arg_1,... arg_d-1 : vectors
-            Alternatively, if no vectors is passed in then
-             arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)

-        Returns
-        -------
-        values : array-like
-            The values evaluated at meshgrid(*args).
-
-        """
-        if len(args)==0:
-            args = []
-            for i in range(self.d):
-                args.append(np.linspace(self.xmin[i], self.xmax[i], self.inc))
-        self.args = args
-        return self._eval_grid_fast(*args)
    def _eval_grid_fast(self, *args):
        # TODO: This does not work correctly yet! Check it.
        X = np.vstack(args)
@ -509,29 +604,6 @@ class KDE(object):
        ix = (slice(0, inc),)*d
        return z[ix] * (z[ix] > 0.0)
   
-    def eval_grid(self, *args):
-        """Evaluate the estimated pdf on a grid.
-
-        Parameters
-        ----------
-        arg_0,arg_1,... arg_d-1 : vectors
-            Alternatively, if no vectors is passed in then
-             arg_i = linspace(self.xmin[i], self.xmax[i], self.inc)
-
-        Returns
-        -------
-        values : array-like
-            The values evaluated at meshgrid(*args).
-
-        """
-
-        if len(args)==0:
-            args = []
-            for i in range(self.d):
-                args.append(np.linspace(self.xmin[i], self.xmax[i], self.inc))
-        self.args = args
-        return self._eval_grid(*args)
-    
    def _eval_grid(self, *args):
        
        grd = meshgrid(*args) if len(args) > 1 else list(args)
@ -539,23 +611,11 @@ class KDE(object):
        d = len(grd)
        for i in range(d):
            grd[i] = grd[i].ravel()
-        f = self.evaluate(np.vstack(grd))
+        f = self.eval_points(np.vstack(grd))
        return f.reshape(shape0)
    
-    def _check_shape(self, points):
-        points = atleast_2d(points)
-        d, m = points.shape
-        if d != self.d:
-            if d == 1 and m == self.d:
-                # points was passed in as a row vector
-                points = np.reshape(points, (self.d, 1))
-                m = 1
-            else:
-                msg = "points have dimension %s, dataset has dimension %s" % (d,
-                    self.d)
-                raise ValueError(msg)
-        return points   
-    def evaluate(self, points):
+
+    def _eval_points(self, points):
        """Evaluate the estimated pdf on a set of points.

        Parameters
@ -574,8 +634,6 @@ class KDE(object):
        ValueError if the dimensionality of the input points is different than
        the dimensionality of the KDE.
        """
-
-        points = self._check_shape(points)
        d, m = points.shape
       
        result = np.zeros((m,))
@ -598,8 +656,6 @@ class KDE(object):

        return result

-    __call__ = evaluate
-
    
 class _Kernel(object):
    def __init__(self, r=1.0, stats=None):
@ -898,7 +954,7 @@ class Kernel(object):
        # R= int(mkernel(x)^2),  mu2= int(x^2*mkernel(x))
        mu2, R, Rdd = self.stats()
        AMISEconstant = (8 * sqrt(pi) * R / (3 * mu2 ** 2 * n)) ** (1. / 5)
-        iqr = np.abs(np.percentile(A, 75, axis=1) - np.percentile(A, 25, axis=1))# interquartile range
+        iqr = iqrange(A, axis=1) # interquartile range
        stdA = np.std(A, axis=1, ddof=1)
        #  % use of interquartile range guards against outliers.
        #  % the use of interquartile range is better if 
@ -1068,7 +1124,7 @@ class Kernel(object):
        ax1 = amin - arange / 8.0
        bx1 = amax + arange / 8.0
        
-        kernel2 = Kernel('gaus') 
+        kernel2 = Kernel('gauss') 
        mu2, R, Rdd = kernel2.stats()
        STEconstant2 = R / (mu2 ** (2) * n)
        fft = np.fft.fft
@ -1142,9 +1198,9 @@ class Kernel(object):
    
    def norm_factor(self, d=1, n=None):
        return  self.kernel.norm_factor(d, n)    
-    def evaluate(self, X):
-        return self.kernel(np.atleast_2d(X))
-    __call__ = evaluate
+    def eval_points(self, points):
+        return self.kernel(np.atleast_2d(points))
+    __call__ = eval_points
    
 def mkernel(X, kernel):
    '''
@ -1297,6 +1353,39 @@ def accum(accmap, a, func=None, size=None, fill_value=0, dtype=None):

    return out

+def iqrange(data, axis=None):
+    '''
+    Returns the Inter Quartile Range of data
+    
+    Parameters
+    ----------
+    data : array-like
+        Input array or object that can be converted to an array.
+    axis : {None, int}, optional
+        Axis along which the percentiles are computed. The default (axis=None)
+        is to compute the percentiles along a flattened version of the array.
+    Returns
+    -------
+    r : array-like
+        abs(np.percentile(data, 75, axis)-np.percentile(data, 25, axis))
+    
+    Notes
+    -----    
+    IQRANGE is a robust measure of spread. The use of interquartile range
+    guards against outliers if the distribution has heavy tails.
+    
+    Example
+    -------
+    >>> a = np.arange(101)
+    >>> iqrange(a)
+    50.0
+    
+    See also  
+    --------
+    np.std
+    '''
+    return np.abs(np.percentile(data, 75, axis=axis)-np.percentile(data, 25, axis=axis))
+
 def bitget(int_type, offset):
    '''
    Returns the value of the bit at the offset position in int_type.
--- a/pywafo/src/wafo/test/test_kdetools.py
+++ b/pywafo/src/wafo/test/test_kdetools.py
@ -25,14 +25,16 @@ def test0_KDE1D():
    >>> kde0.eval_grid(x)
    array([ 0.2039735 ,  0.40252503,  0.54595078,  0.52219649,  0.3906213 ,
            0.26381501,  0.16407362,  0.08270612,  0.02991145,  0.00720821])
-    
+    >>> kde0.eval_grid_fast(x)
+    array([ 0.32343789,  0.51366167,  0.55643329,  0.43688805,  0.28972471,
+            0.19445277,  0.12473331,  0.06195215,  0.02087712,  0.00449567])
    >>> f = kde0.eval_grid_fast(); f
-    array([ 0.07264948,  0.14135253,  0.24141397,  0.36045498,  0.46962192,
-            0.53604004,  0.5427015 ,  0.49767387,  0.42419428,  0.34349993,
-            0.26650289,  0.19666903,  0.13569857,  0.0857818 ,  0.04868357,
-            0.02432961])
+    array([ 0.02076721,  0.0612371 ,  0.14515308,  0.27604202,  0.42001793,
+            0.51464781,  0.52131018,  0.45976136,  0.37621768,  0.29589521,
+            0.21985316,  0.1473364 ,  0.08502256,  0.04063749,  0.0155788 ,
+            0.00466938])
    >>> np.trapz(f,kde0.args)
-    array([ 0.97384215])
+    array([ 0.99416766])
    '''
 def test1_TKDE1D():
    '''