Fixed a bug in Kernel.name; simplified qlevels and made it general.

9 years ago · 669fadc9a5
parent 40bfa22d46
commit 669fadc9a5
3 changed files with 174 additions and 129 deletions
--- a/wafo/kdetools/kernels.py
+++ b/wafo/kdetools/kernels.py
@ -22,6 +22,12 @@ def _assert(cond, msg):
    if not cond:
        raise ValueError(msg)
 def _assert_warn(cond, msg):
    if not cond:
        warnings.warn(msg)
 # stats = (mu2, R, Rdd) where
 #     mu2 : 2'nd order moment, i.e.,int(x^2*kernel(x))
 #     R :  integral of squared kernel, i.e., int(kernel(x)^2)
@ -36,31 +42,39 @@ _stats_logi = (pi ** 2 / 3, 1. / 6, 1 / 42)
 _stats_gaus = (1, 1. / (2 * sqrt(pi)), 3. / (8 * sqrt(pi)))
-def qlevels(pdf, p=(10, 30, 50, 70, 90, 95, 99, 99.9), x1=None, x2=None):
+def qlevels(pdf, p=(10, 30, 50, 70, 90, 95, 99, 99.9), xi=(), indexing='xy'):
-    """QLEVELS Calculates quantile levels which encloses P% of PDF.
+    """QLEVELS Calculates quantile levels which encloses P% of pdf.
-      CALL: [ql PL] = qlevels(pdf,PL,x1,x2);
+    Parameters
    ----------
    pdf: array-like
        joint point density function given as array or vector
    p : float in range of [0,100] (or sequence of floats)
        Percentage to compute which must be between 0 and 100 inclusive.
    xi : tuple
        input arguments to the pdf, i.e., (x0, x1,...., xn)
    indexing : {'xy', 'ij'}, optional
        Cartesian ('xy', default) or matrix ('ij') indexing of pdf.
        See numpy.meshgrid for more details.
-            ql    = the discrete quantile levels.
+    Returns
-            pdf   = joint point density function matrix or vector
+    -------
-            PL    = percent level (default [10:20:90 95 99 99.9])
+    levels: array-like
-            x1,x2 = vectors of the spacing of the variables
+        discrete levels which enclose P% of pdf
                   (Default unit spacing)
    QLEVELS numerically integrates PDF by decreasing height and find the
-    quantile levels which  encloses P% of the distribution. If X1 and
+    quantile levels which enclose P% of the distribution.
-    (or) X2 is unspecified it is assumed that dX1 and dX2 is constant.
+
-    NB! QLEVELS normalizes the integral of PDF to N/(N+0.001) before
+    If xi is unspecified it is assumed that dX0, dX1,..., and dXn are constant.
-    calculating QL in order to reflect the sampling of PDF is finite.
+    NB! QLEVELS normalizes the integral of PDF to n/(n+1.5e-8) before
-    Currently only able to handle 1D and 2D PDF's if dXi is not constant
+    calculating 'levels' in order to reflect that the sampling of PDF is finite.
    (i=1,2).
    Example
    -------
    >>> import wafo.stats as ws
    >>> x = np.linspace(-8,8,2001);
    >>> PL = np.r_[10:90:20, 90, 95, 99, 99.9]
-    >>> qlevels(ws.norm.pdf(x),p=PL, x1=x);
+    >>> qlevels(ws.norm.pdf(x),p=PL, xi=(x,));
    array([ 0.39591707,  0.37058719,  0.31830968,  0.23402133,  0.10362052,
            0.05862129,  0.01449505,  0.00178806])
@ -74,73 +88,58 @@ def qlevels(pdf, p=(10, 30, 50, 70, 90, 95, 99, 99.9), x1=None, x2=None):
    qlevels2, tranproc
    """
    def _dx(x):
        dx = np.diff(x.ravel()) * 0.5
        return np.r_[0, dx] + np.r_[dx, 0]
    def _init(pdf, xi, indexing):
        """Return the flattened pdf weighted by the cell size of its grid.

        Parameters
        ----------
        pdf : ndarray
            density values.
        xi : tuple of ndarray, ndarray or empty
            grid vectors, one per dimension of pdf. A single grid vector
            may be given without wrapping it in a tuple. If xi is empty
            (or None) the pdf is returned flattened and unweighted.
        indexing : {'xy', 'ij'}
            passed on to numpy.meshgrid.

        Returns
        -------
        pdf_dx : ndarray
            1D array of pdf values multiplied by the product of the
            grid-point weights computed by _dx.

        Raises
        ------
        ValueError
            (through _assert) if the shape of the weights built from xi
            differs from pdf.shape.
        """
        if isinstance(xi, np.ndarray):
            # A bare grid vector must be wrapped before any truth test:
            # `not xi` raises ValueError for arrays with more than one
            # element, so the emptiness check is only used for non-arrays.
            xi = (xi,)
        elif not xi:
            return pdf.ravel()
        elif not isinstance(xi, tuple):
            xi = (xi,)
        dx = np.meshgrid(*[_dx(x) for x in xi], sparse=True, indexing=indexing)
        # The sparse grids broadcast against each other, so their running
        # product expands to the weight of every cell.
        dxij = np.ones((1))
        for dxi in dx:
            dxij = dxij * dxi
        _assert(dxij.shape == pdf.shape,
                'Shape of pdf does not match the arguments')
        return (pdf * dxij).ravel()
    def _check_levels(levels, pdf):
        """Warn when any computed level reaches an extreme value of pdf.

        A level at or above the largest pdf value, or at or below the
        smallest pdf value, is reported through _assert_warn.
        """
        flat_pdf = pdf.ravel()
        highest = max(flat_pdf)
        above_peak = np.any(levels >= highest)
        _assert_warn(not above_peak,
                     'The lowest percent level is too close to 0%')
        lowest = min(flat_pdf)
        below_floor = np.any(levels <= lowest)
        _assert_warn(not below_floor,
                     'The given pdf is too sparsely sampled or the highest '
                     'percent level is too close to 100%')
    pdf, p = np.atleast_1d(pdf, p)
    _assert(not any(pdf.ravel() < 0),
            'This is not a pdf since one or more values of pdf is negative')
    _assert(not np.any((p < 0) | (100 < p)), 'PL must satisfy 0 <= PL <= 100')
-    norm = 1  # normalize cdf to unity
+    if min(pdf.shape) == 0:
    pdf = np.atleast_1d(pdf)
    _assert(not any(pdf.ravel() < 0), 'This is not a pdf since one or more '
            'values of pdf is negative')
    fsiz = pdf.shape
    fsizmin = min(fsiz)
    if fsizmin == 0:
        return []
    N = np.prod(fsiz)
    d = len(fsiz)
    if x1 is None or ((x2 is None) and d > 2):
        fdfi = pdf.ravel()
    else:
        if d == 1:  # pdf in one dimension
            dx22 = np.ones(1)
        else:  # % pdf in two dimensions
            dx2 = np.diff(x2.ravel()) * 0.5
            dx22 = np.r_[0, dx2] + np.r_[dx2, 0]
        dx1 = np.diff(x1.ravel()) * 0.5
        dx11 = np.r_[0, dx1] + np.r_[dx1, 0]
        dx1x2 = dx22[:, None] * dx11
        fdfi = (pdf * dx1x2).ravel()
    p = np.atleast_1d(p)
    _assert(not np.any((p < 0) | (100 < p)), 'PL must satisfy 0 <= PL <= 100')
    p2 = p / 100.0
    ind = np.argsort(pdf.ravel())  # sort by height of pdf
    ind = ind[::-1]
-    fi = pdf.flat[ind]
+    sorted_pdf = pdf.flat[ind]
    pdf_dx = _init(pdf, xi, indexing=indexing)
    # integration in the order of decreasing height of pdf
-    Fi = np.cumsum(fdfi[ind])
+    cdf = np.cumsum(pdf_dx[ind])
-
+    n = pdf_dx.size
-    if norm:  # normalize Fi to make sure int pdf dx1 dx2 approx 1
+    # normalize cdf to make sure int pdf dx1 dx2 approx 1
-        Fi = Fi / Fi[-1] * N / (N + 1.5e-8)
+    cdf = cdf / cdf[-1] * n / (n + 1.5e-8)
    maxFi = np.max(Fi)
    if maxFi > 1:
        warnings.warn('this is not a pdf since cdf>1! normalizing')
        Fi = Fi / Fi[-1] * N / (N + 1.5e-8)
-    elif maxFi < .95:
+    # make sure cdf is strictly increasing by not considering duplicate values
-        msg = '''The given pdf is too sparsely sampled since cdf<.95.
+    ind, = np.where(np.diff(np.r_[cdf, 1]) > 0)
        Thus QL is questionable'''
        warnings.warn(msg)
    # make sure Fi is strictly increasing by not considering duplicate values
    ind, = np.where(np.diff(np.r_[Fi, 1]) > 0)
    # calculating the inverse of Fi to find the index
    ui = tranproc(Fi[ind], fi[ind], p2)
-    if np.any(ui >= max(pdf.ravel())):
+    # calculating the inverse of cdf to find the levels
-        warnings.warn('The lowest percent level is too close to 0%')
+    levels = tranproc(cdf[ind], sorted_pdf[ind], p / 100.0)
-    if np.any(ui <= min(pdf.ravel())):
+    _check_levels(levels, pdf)
-        msg = '''The given pdf is too sparsely sampled or
+    levels[levels < 0] = 0.0
-       the highest percent level is too close to 100%'''
+    return levels
        warnings.warn(msg)
        ui[ui < 0] = 0.0
    return ui
 def qlevels2(data, p=(10, 30, 50, 70, 90, 95, 99, 99.9), method=1):
@ -262,9 +261,16 @@ def sphere_volume(d, r=1.0):
 class _Kernel(object):
    __metaclass__ = ABCMeta
-    def __init__(self, r=1.0, stats=None):
+    def __init__(self, r=1.0, stats=None, name=''):
-        self.r = r  # radius of kernel
+        self.r = r  # radius of effective support of kernel
        self.stats = stats
        if not name:
            name = self.__class__.__name__.replace('_Kernel', '')
        self._name = name
    @property
    def name(self):
        return self._name
    def norm_factor(self, d=1, n=None):
        _assert(0 < d, "D")
@ -305,12 +311,12 @@ class _KernelMulti(_Kernel):
    p=0;  Sphere = rect for 1D
    p=1;  Multivariate Epanechnikov kernel.
    p=2;  Multivariate Bi-weight Kernel
-    p=3;  Multi variate Tri-weight Kernel
+    p=3;  Multivariate Tri-weight Kernel
-    p=4;  Multi variate Four-weight Kernel
+    p=4;  Multivariate Four-weight Kernel
    """
-    def __init__(self, r=1.0, p=1, stats=None):
+    def __init__(self, r=1.0, p=1, stats=None, name=''):
        self.p = p
-        super(_KernelMulti, self).__init__(r, stats)
+        super(_KernelMulti, self).__init__(r, stats, name)
    def norm_factor(self, d=1, n=None):
        r = self.r
@ -325,9 +331,10 @@ class _KernelMulti(_Kernel):
        x2 = x ** 2
        return ((1.0 - x2.sum(axis=0) / r ** 2).clip(min=0.0)) ** p
-mkernel_epanechnikov = _KernelMulti(p=1, stats=_stats_epan)
+mkernel_epanechnikov = _KernelMulti(p=1, stats=_stats_epan,
-mkernel_biweight = _KernelMulti(p=2, stats=_stats_biwe)
+                                    name='epanechnikov')
-mkernel_triweight = _KernelMulti(p=3, stats=_stats_triw)
+mkernel_biweight = _KernelMulti(p=2, stats=_stats_biwe, name='biweight')
 mkernel_triweight = _KernelMulti(p=3, stats=_stats_triw, name='triweight')
 class _KernelProduct(_KernelMulti):
@ -350,9 +357,11 @@ class _KernelProduct(_KernelMulti):
        pdf = (1 - (x / r) ** 2).clip(min=0.0) ** self.p
        return pdf.prod(axis=0)
-mkernel_p1epanechnikov = _KernelProduct(p=1, stats=_stats_epan)
+mkernel_p1epanechnikov = _KernelProduct(p=1, stats=_stats_epan,
-mkernel_p1biweight = _KernelProduct(p=2, stats=_stats_biwe)
+                                        name='p1epanechnikov')
-mkernel_p1triweight = _KernelProduct(p=3, stats=_stats_triw)
+mkernel_p1biweight = _KernelProduct(p=2, stats=_stats_biwe, name='p1biweight')
 mkernel_p1triweight = _KernelProduct(p=3, stats=_stats_triw,
                                     name='p1triweight')
 class _KernelRectangular(_Kernel):
@ -404,11 +413,6 @@ class _KernelGaussian(_Kernel):
 mkernel_gaussian = _KernelGaussian(r=4.0, stats=_stats_gaus)
 # def mkernel_gaussian(X):
 #    x2 = X ** 2
 #    d = X.shape[0]
 #    return (2 * pi) ** (-d / 2) * exp(-0.5 * x2.sum(axis=0))
 class _KernelLaplace(_Kernel):
@ -439,8 +443,8 @@ _MKERNEL_DICT = dict(
    tria=mkernel_triangular,
    lapl=mkernel_laplace,
    logi=mkernel_logistic,
-    gaus=mkernel_gaussian
+    gaus=mkernel_gaussian)
-)
+
 _KERNEL_EXPONENT_DICT = dict(
    re=0, sp=0, ep=1, bi=2, tr=3, fo=4, fi=5, si=6, se=7)
@ -530,7 +534,7 @@ class Kernel(object):
    @property
    def name(self):
-        return self.kernel.__class__.__name__.replace('_Kernel', '').title()
+        return self.kernel.name
    def stats(self):
        """Return some 1D statistics of the kernel.
@ -586,8 +590,13 @@ class Kernel(object):
        visual check by eye.
        Example:
-          data = rndnorm(0, 1,20,1)
+        -------
-          h = hns(data,'epan')
+        >>> import numpy as np
        >>> import wafo.kdetools as wk
        >>> import wafo.stats as ws
        >>> kernel = wk.Kernel('epan')
        >>> data = ws.norm.rvs(0, 1, size=(1,20))
        >>> h = kernel.hns(data)
        See also:
        ---------
@ -601,7 +610,6 @@ class Kernel(object):
        Wand,M.P. and Jones, M.C. (1995)
        'Kernel smoothing'
        Chapman and Hall, pp 60--63
        """
        a = np.atleast_2d(data)
@ -611,13 +619,13 @@ class Kernel(object):
        mu2, R, _Rdd = self.stats()
        amise_constant = (8 * sqrt(pi) * R / (3 * mu2 ** 2 * n)) ** (1. / 5)
        iqr = iqrange(a, axis=1)  # interquartile range
-        stdA = np.std(a, axis=1, ddof=1)
+        std_a = np.std(a, axis=1, ddof=1)
        # Use of the interquartile range guards against outliers.
        # The interquartile range is the better choice if
        # the distribution is skewed or has heavy tails.
        # This lessens the chance of oversmoothing.
        return np.where(iqr > 0,
-                        np.minimum(stdA, iqr / 1.349), stdA) * amise_constant
+                        np.minimum(std_a, iqr / 1.349), std_a) * amise_constant
    def hos(self, data):
        """Returns Oversmoothing Parameter.
@ -680,7 +688,8 @@ class Kernel(object):
        elif name == 'gaus':  # Gaussian kernel
            a = (4.0 / (d + 2.0)) ** (1. / (d + 4.0))
        else:
-            raise ValueError('Unknown kernel.')
+            raise NotImplementedError('Hmns bandwidth not implemented for '
                                      'kernel {}.'.format(name))
        return a
    def hmns(self, data):
@ -696,7 +705,6 @@ class Kernel(object):
                    'triweight'     - Tri-weight kernel.
                    'gaussian'      - Gaussian kernel
          Note that only the first 4 letters of the kernel name is needed.
         HMNS only gives an optimal value with respect to mean integrated
         square error, when the true underlying distribution is Multivariate
@ -725,7 +733,6 @@ class Kernel(object):
          Chapman and Hall, pp 60--63, 86--88
        """
        # TODO: implement more kernels
        a = np.atleast_2d(data)
        d, n = a.shape
@ -1081,35 +1088,53 @@ class Kernel(object):
                warnings.warn('The obtained value did not converge.')
            h[dim] = h1 * s
-        # end % for dim loop
+        # end # for dim loop
        return h
    def hscv(self, data, hvec=None, inc=128, maxit=100, fulloutput=False):
        '''
        HSCV Smoothed cross-validation estimate of smoothing parameter.
         CALL: [hs,hvec,score] = hscv(data,kernel,hvec)
-           hs     = smoothing parameter
+        Parameters
-           hvec   = vector defining possible values of hs
+        ----------
        data   = data vector
        hvec   = vector defining possible values of hs
                     (default linspace(0.25*h0,h0,100), h0=0.62)
-           score  = score vector
+        inc = length of estimated kerneldensity estimate
-           data   = data vector
+        maxit = maximum number of iterations
-           kernel = 'gaussian'      - Gaussian kernel the only supported
+        fulloutput = True if fulloutput is wanted
          Note that only the first 4 letters of the kernel name is needed.
-          Example:
+        Returns
-            data = rndnorm(0,1,20,1)
+        -------
-             [hs hvec score] = hscv(data,'epan');
+        hs     = smoothing parameter
-             plot(hvec,score)
+        hvec   = vector defining possible values of hs
-         See also  hste, hbcv, hboot, hos, hldpi, hlscv, hstt, kde, kdefun
+        score  = score vector
          Example
          ------
          >>> import wafo.kdetools as wk
          >>> import wafo.stats as ws
          >>> data = ws.norm.rvs(0,1, size=(1,20))
          >>> kernel = wk.Kernel('epan')
          >>> hs0 = kernel.hscv(data, fulloutput=False)
          >>> hs, hvec, score = kernel.hscv(data, fulloutput=True)
          >>> np.allclose(hs, hs0)
          True
         import matplotlib.pyplot as plt
         plt.plot(hvec,score)
         See also:
         hste, hbcv, hboot, hos, hldpi, hlscv, hstt, kde, kdefun
-         Wand,M.P. and Jones, M.C. (1986)
+        Reference
-         'Kernel smoothing'
+        ---------
-          Chapman and Hall, pp 75--79
+        Wand,M.P. and Jones, M.C. (1986)
        'Kernel smoothing'
        Chapman and Hall, pp 75--79
        '''
-        # TODO: Add support for other kernels than Gaussian
+
        A = np.atleast_2d(data)
        d, n = A.shape
@ -1209,18 +1234,17 @@ class Kernel(object):
            idx = score.argmin()
            # Kernel other than Gaussian scale bandwidth
            h[dim] = hvec[idx] * (ste_constant / ste_constant2) ** (1 / 5)
-            if idx == 0:
+            _assert_warn(0 < idx,
-                warnings.warn("Optimum is probably lower than "
+                         "Optimum is probably lower than "
-                              "hs={0:g} for dim={1:d}".format(h[dim] * s, dim))
+                         "hs={0:g} for dim={1:d}".format(h[dim] * s, dim))
-            elif idx == maxit - 1:
+            _assert_warn(idx < maxit - 1,
-                msg = "Optimum is probably higher than hs={0:g] for dim={1:d}"
+                         "Optimum is probably higher than "
-                warnings.warn(msg.format(h[dim] * s, dim))
+                         "hs={0:g} for dim={1:d}".format(h[dim] * s, dim))
        hvec = hvec * (ste_constant / ste_constant2) ** (1 / 5)
        if fulloutput:
-            return h * sigmaA, score, hvec, sigmaA
+            return h * sigmaA, score, hvec
-        else:
+        return h * sigmaA
            return h * sigmaA
    def hldpi(self, data, L=2, inc=128):
        '''HLDPI L-stage Direct Plug-In estimate of smoothing parameter.
@ -1361,7 +1385,7 @@ def mkernel(X, kernel):
    See also
    --------
-    kde, kdefun, kdebin
+    KDE
    References
    ----------
--- a/wafo/spectrum/core.py
+++ b/wafo/spectrum/core.py
@ -1131,7 +1131,7 @@ class SpecData1D(PlotData):
                         title='Joint density of maximum and minimum')
        try:
            pl = [10, 30, 50, 70, 90, 95, 99, 99.9]
-            mmpdf.cl = qlevels(uvdens, pl, h, h)
+            mmpdf.cl = qlevels(uvdens, pl, xi=(h, h))
            mmpdf.pl = pl
        except:
            pass
--- a/wafo/tests/test_kdetools.py
+++ b/wafo/tests/test_kdetools.py
@ -449,6 +449,27 @@ class TestSmoothing(unittest.TestCase):
        assert_allclose(hs, [[3.25196193e-01, -2.68892467e-02, 3.18932448e-04],
                             [-2.68892467e-02, 3.91283306e-01, 2.38654678e-02],
                             [3.18932448e-04, 2.38654678e-02, 4.05123874e-01]])
        hs = self.gauss.hmns(self.data[0])
        assert_allclose(hs, self.gauss.hns(self.data[0]))
        hs = wk.Kernel('epan').hmns(self.data)
        assert_allclose(hs,
                        [[8.363847e-01, -6.915749e-02, 8.202747e-04],
                         [-6.915749e-02, 1.006357e+00, 6.138052e-02],
                         [8.202747e-04, 6.138052e-02, 1.041954e+00]],
                        rtol=1e-5)
        hs = wk.Kernel('biwe').hmns(self.data[:2])
        assert_allclose(hs, [[0.868428, -0.071705],
                             [-0.071705, 1.04685]], rtol=1e-5)
        hs = wk.Kernel('triwe').hmns(self.data[:2])
        assert_allclose(hs, [[0.975375, -0.080535],
                             [-0.080535, 1.17577]], rtol=1e-5)
        self.assertRaises(NotImplementedError,
                          wk.Kernel('biwe').hmns, self.data)
        self.assertRaises(NotImplementedError,
                          wk.Kernel('triwe').hmns, self.data)
        self.assertRaises(NotImplementedError,
                          wk.Kernel('triangular').hmns, self.data)
    def test_hscv(self):
        hs = self.gauss.hscv(self.data)