fixed a bug in Kernel.name and simplified glevels and made it general.

9 years ago · 669fadc9a5
parent 40bfa22d46
commit 669fadc9a5
3 changed files with 174 additions and 129 deletions
--- a/wafo/kdetools/kernels.py
+++ b/wafo/kdetools/kernels.py
@ -22,6 +22,12 @@ def _assert(cond, msg):
    if not cond:
        raise ValueError(msg)

+
+def _assert_warn(cond, msg):
+    if not cond:
+        warnings.warn(msg)
+
+
 # stats = (mu2, R, Rdd) where
 #     mu2 : 2'nd order moment, i.e.,int(x^2*kernel(x))
 #     R :  integral of squared kernel, i.e., int(kernel(x)^2)
@ -36,31 +42,39 @@ _stats_logi = (pi ** 2 / 3, 1. / 6, 1 / 42)
 _stats_gaus = (1, 1. / (2 * sqrt(pi)), 3. / (8 * sqrt(pi)))


-def qlevels(pdf, p=(10, 30, 50, 70, 90, 95, 99, 99.9), x1=None, x2=None):
-    """QLEVELS Calculates quantile levels which encloses P% of PDF.
+def qlevels(pdf, p=(10, 30, 50, 70, 90, 95, 99, 99.9), xi=(), indexing='xy'):
+    """QLEVELS Calculates quantile levels which encloses P% of pdf.

-      CALL: [ql PL] = qlevels(pdf,PL,x1,x2);
+    Parameters
+    ----------
+    pdf: array-like
+        joint point density function given as array or vector
+    p : float in range of [0,100] (or sequence of floats)
+        Percentage to compute which must be between 0 and 100 inclusive.
+    xi : tuple
+        input arguments to the pdf, i.e., (x0, x1,...., xn)
+    indexing : {'xy', 'ij'}, optional
+        Cartesian ('xy', default) or matrix ('ij') indexing of pdf.
+        See numpy.meshgrid for more details.

-            ql    = the discrete quantile levels.
-            pdf   = joint point density function matrix or vector
-            PL    = percent level (default [10:20:90 95 99 99.9])
-            x1,x2 = vectors of the spacing of the variables
-                   (Default unit spacing)
+    Returns
+    ------
+    levels: array-like
+        discrete levels which encloses P% of pdf

    QLEVELS numerically integrates PDF by decreasing height and find the
-    quantile levels which  encloses P% of the distribution. If X1 and
-    (or) X2 is unspecified it is assumed that dX1 and dX2 is constant.
-    NB! QLEVELS normalizes the integral of PDF to N/(N+0.001) before
-    calculating QL in order to reflect the sampling of PDF is finite.
-    Currently only able to handle 1D and 2D PDF's if dXi is not constant
-    (i=1,2).
+    quantile levels which  encloses P% of the distribution.
+
+    If Xi is unspecified it is assumed that dX0, dX1,..., and dXn is constant.
+    NB! QLEVELS normalizes the integral of PDF to n/(n+0.001) before
+    calculating 'levels' in order to reflect the sampling of PDF is finite.

    Example
    -------
    >>> import wafo.stats as ws
    >>> x = np.linspace(-8,8,2001);
    >>> PL = np.r_[10:90:20, 90, 95, 99, 99.9]
-    >>> qlevels(ws.norm.pdf(x),p=PL, x1=x);
+    >>> qlevels(ws.norm.pdf(x),p=PL, xi=(x,));
    array([ 0.39591707,  0.37058719,  0.31830968,  0.23402133,  0.10362052,
            0.05862129,  0.01449505,  0.00178806])

@ -74,73 +88,58 @@ def qlevels(pdf, p=(10, 30, 50, 70, 90, 95, 99, 99.9), x1=None, x2=None):
    qlevels2, tranproc

    """
+    def _dx(x):
+        dx = np.diff(x.ravel()) * 0.5
+        return np.r_[0, dx] + np.r_[dx, 0]
+
+    def _init(pdf, xi, indexing):
+        if not xi:
+            return pdf.ravel()
+        if not isinstance(xi, tuple):
+            xi = (xi,)
+        dx = np.meshgrid(*[_dx(x) for x in xi], sparse=True, indexing=indexing)
+        dxij = np.ones((1))
+        for dxi in dx:
+            dxij = dxij * dxi
+        _assert(dxij.shape == pdf.shape,
+                'Shape of pdf does not match the arguments')
+        return (pdf * dxij).ravel()
+
+    def _check_levels(levels, pdf):
+        _assert_warn(not np.any(levels >= max(pdf.ravel())),
+                     'The lowest percent level is too close to 0%')
+        _assert_warn(not np.any(levels <= min(pdf.ravel())),
+                     'The given pdf is too sparsely sampled or the highest '
+                     'percent level is too close to 100%')
+
+    pdf, p = np.atleast_1d(pdf, p)
+    _assert(not any(pdf.ravel() < 0),
+            'This is not a pdf since one or more values of pdf is negative')
+    _assert(not np.any((p < 0) | (100 < p)), 'PL must satisfy 0 <= PL <= 100')

-    norm = 1  # normalize cdf to unity
-    pdf = np.atleast_1d(pdf)
-    _assert(not any(pdf.ravel() < 0), 'This is not a pdf since one or more '
-            'values of pdf is negative')
-
-    fsiz = pdf.shape
-    fsizmin = min(fsiz)
-    if fsizmin == 0:
+    if min(pdf.shape) == 0:
        return []

-    N = np.prod(fsiz)
-    d = len(fsiz)
-    if x1 is None or ((x2 is None) and d > 2):
-        fdfi = pdf.ravel()
-    else:
-        if d == 1:  # pdf in one dimension
-            dx22 = np.ones(1)
-        else:  # % pdf in two dimensions
-            dx2 = np.diff(x2.ravel()) * 0.5
-            dx22 = np.r_[0, dx2] + np.r_[dx2, 0]
-
-        dx1 = np.diff(x1.ravel()) * 0.5
-        dx11 = np.r_[0, dx1] + np.r_[dx1, 0]
-        dx1x2 = dx22[:, None] * dx11
-        fdfi = (pdf * dx1x2).ravel()
-
-    p = np.atleast_1d(p)
-    _assert(not np.any((p < 0) | (100 < p)), 'PL must satisfy 0 <= PL <= 100')
-
-    p2 = p / 100.0
    ind = np.argsort(pdf.ravel())  # sort by height of pdf
    ind = ind[::-1]
-    fi = pdf.flat[ind]
+    sorted_pdf = pdf.flat[ind]

+    pdf_dx = _init(pdf, xi, indexing=indexing)
    # integration in the order of decreasing height of pdf
-    Fi = np.cumsum(fdfi[ind])
+    cdf = np.cumsum(pdf_dx[ind])
+    n = pdf_dx.size
+    # normalize cdf to make sure int pdf dx1 dx2 approx 1
+    cdf = cdf / cdf[-1] * n / (n + 1.5e-8)

-    if norm:  # normalize Fi to make sure int pdf dx1 dx2 approx 1
-        Fi = Fi / Fi[-1] * N / (N + 1.5e-8)
+    # make sure cdf is strictly increasing by not considering duplicate values
+    ind, = np.where(np.diff(np.r_[cdf, 1]) > 0)

-    maxFi = np.max(Fi)
-    if maxFi > 1:
-        warnings.warn('this is not a pdf since cdf>1! normalizing')
+    # calculating the inverse of cdf to find the levels
+    levels = tranproc(cdf[ind], sorted_pdf[ind], p / 100.0)

-        Fi = Fi / Fi[-1] * N / (N + 1.5e-8)
-
-    elif maxFi < .95:
-        msg = '''The given pdf is too sparsely sampled since cdf<.95.
-        Thus QL is questionable'''
-        warnings.warn(msg)
-
-    # make sure Fi is strictly increasing by not considering duplicate values
-    ind, = np.where(np.diff(np.r_[Fi, 1]) > 0)
-    # calculating the inverse of Fi to find the index
-    ui = tranproc(Fi[ind], fi[ind], p2)
-
-    if np.any(ui >= max(pdf.ravel())):
-        warnings.warn('The lowest percent level is too close to 0%')
-
-    if np.any(ui <= min(pdf.ravel())):
-        msg = '''The given pdf is too sparsely sampled or
-       the highest percent level is too close to 100%'''
-        warnings.warn(msg)
-        ui[ui < 0] = 0.0
-
-    return ui
+    _check_levels(levels, pdf)
+    levels[levels < 0] = 0.0
+    return levels


 def qlevels2(data, p=(10, 30, 50, 70, 90, 95, 99, 99.9), method=1):
@ -262,9 +261,16 @@ def sphere_volume(d, r=1.0):
 class _Kernel(object):
    __metaclass__ = ABCMeta

-    def __init__(self, r=1.0, stats=None):
-        self.r = r  # radius of kernel
+    def __init__(self, r=1.0, stats=None, name=''):
+        self.r = r  # radius of effective support of kernel
        self.stats = stats
+        if not name:
+            name = self.__class__.__name__.replace('_Kernel', '')
+        self._name = name
+
+    @property
+    def name(self):
+        return self._name

    def norm_factor(self, d=1, n=None):
        _assert(0 < d, "D")
@ -305,12 +311,12 @@ class _KernelMulti(_Kernel):
    p=0;  Sphere = rect for 1D
    p=1;  Multivariate Epanechnikov kernel.
    p=2;  Multivariate Bi-weight Kernel
-    p=3;  Multi variate Tri-weight Kernel
-    p=4;  Multi variate Four-weight Kernel
+    p=3;  Multivariate Tri-weight Kernel
+    p=4;  Multivariate Four-weight Kernel
    """
-    def __init__(self, r=1.0, p=1, stats=None):
+    def __init__(self, r=1.0, p=1, stats=None, name=''):
        self.p = p
-        super(_KernelMulti, self).__init__(r, stats)
+        super(_KernelMulti, self).__init__(r, stats, name)

    def norm_factor(self, d=1, n=None):
        r = self.r
@ -325,9 +331,10 @@ class _KernelMulti(_Kernel):
        x2 = x ** 2
        return ((1.0 - x2.sum(axis=0) / r ** 2).clip(min=0.0)) ** p

-mkernel_epanechnikov = _KernelMulti(p=1, stats=_stats_epan)
-mkernel_biweight = _KernelMulti(p=2, stats=_stats_biwe)
-mkernel_triweight = _KernelMulti(p=3, stats=_stats_triw)
+mkernel_epanechnikov = _KernelMulti(p=1, stats=_stats_epan,
+                                    name='epanechnikov')
+mkernel_biweight = _KernelMulti(p=2, stats=_stats_biwe, name='biweight')
+mkernel_triweight = _KernelMulti(p=3, stats=_stats_triw, name='triweight')


 class _KernelProduct(_KernelMulti):
@ -350,9 +357,11 @@ class _KernelProduct(_KernelMulti):
        pdf = (1 - (x / r) ** 2).clip(min=0.0) ** self.p
        return pdf.prod(axis=0)

-mkernel_p1epanechnikov = _KernelProduct(p=1, stats=_stats_epan)
-mkernel_p1biweight = _KernelProduct(p=2, stats=_stats_biwe)
-mkernel_p1triweight = _KernelProduct(p=3, stats=_stats_triw)
+mkernel_p1epanechnikov = _KernelProduct(p=1, stats=_stats_epan,
+                                        name='p1epanechnikov')
+mkernel_p1biweight = _KernelProduct(p=2, stats=_stats_biwe, name='p1biweight')
+mkernel_p1triweight = _KernelProduct(p=3, stats=_stats_triw,
+                                     name='p1triweight')


 class _KernelRectangular(_Kernel):
@ -404,11 +413,6 @@ class _KernelGaussian(_Kernel):

 mkernel_gaussian = _KernelGaussian(r=4.0, stats=_stats_gaus)

-# def mkernel_gaussian(X):
-#    x2 = X ** 2
-#    d = X.shape[0]
-#    return (2 * pi) ** (-d / 2) * exp(-0.5 * x2.sum(axis=0))
-

 class _KernelLaplace(_Kernel):

@ -439,8 +443,8 @@ _MKERNEL_DICT = dict(
    tria=mkernel_triangular,
    lapl=mkernel_laplace,
    logi=mkernel_logistic,
-    gaus=mkernel_gaussian
-)
+    gaus=mkernel_gaussian)
+
 _KERNEL_EXPONENT_DICT = dict(
    re=0, sp=0, ep=1, bi=2, tr=3, fo=4, fi=5, si=6, se=7)

@ -530,7 +534,7 @@ class Kernel(object):

    @property
    def name(self):
-        return self.kernel.__class__.__name__.replace('_Kernel', '').title()
+        return self.kernel.name

    def stats(self):
        """Return some 1D statistics of the kernel.
@ -586,8 +590,13 @@ class Kernel(object):
        visual check by eye.

        Example:
-          data = rndnorm(0, 1,20,1)
-          h = hns(data,'epan')
+        -------
+        >>> import numpy as np
+        >>> import wafo.kdetools as wk
+        >>> import wafo.stats as ws
+        >>> kernel = wk.Kernel('epan')
+        >>> data = ws.norm.rvs(0, 1, size=(1,20))
+        >>> h = kernel.hns(data)

        See also:
        ---------
@ -601,7 +610,6 @@ class Kernel(object):
        Wand,M.P. and Jones, M.C. (1995)
        'Kernel smoothing'
        Chapman and Hall, pp 60--63
-
        """

        a = np.atleast_2d(data)
@ -611,13 +619,13 @@ class Kernel(object):
        mu2, R, _Rdd = self.stats()
        amise_constant = (8 * sqrt(pi) * R / (3 * mu2 ** 2 * n)) ** (1. / 5)
        iqr = iqrange(a, axis=1)  # interquartile range
-        stdA = np.std(a, axis=1, ddof=1)
+        std_a = np.std(a, axis=1, ddof=1)
        # use of interquartile range guards against outliers.
        # the use of interquartile range is better if
        # the distribution is skew or have heavy tails
        # This lessen the chance of oversmoothing.
        return np.where(iqr > 0,
-                        np.minimum(stdA, iqr / 1.349), stdA) * amise_constant
+                        np.minimum(std_a, iqr / 1.349), std_a) * amise_constant

    def hos(self, data):
        """Returns Oversmoothing Parameter.
@ -680,7 +688,8 @@ class Kernel(object):
        elif name == 'gaus':  # Gaussian kernel
            a = (4.0 / (d + 2.0)) ** (1. / (d + 4.0))
        else:
-            raise ValueError('Unknown kernel.')
+            raise NotImplementedError('Hmns bandwidth not implemented for '
+                                      'kernel {}.'.format(name))
        return a

    def hmns(self, data):
@ -696,7 +705,6 @@ class Kernel(object):
                    'triweight'     - Tri-weight kernel.
                    'gaussian'      - Gaussian kernel

-          Note that only the first 4 letters of the kernel name is needed.

         HMNS  only gives  a optimal value with respect to mean integrated
         square error, when the true underlying distribution is Multivariate
@ -725,7 +733,6 @@ class Kernel(object):
          Chapman and Hall, pp 60--63, 86--88

        """
-        # TODO: implement more kernels

        a = np.atleast_2d(data)
        d, n = a.shape
@ -1081,35 +1088,53 @@ class Kernel(object):
                warnings.warn('The obtained value did not converge.')

            h[dim] = h1 * s
-        # end % for dim loop
+        # end # for dim loop
        return h

    def hscv(self, data, hvec=None, inc=128, maxit=100, fulloutput=False):
        '''
        HSCV Smoothed cross-validation estimate of smoothing parameter.

-         CALL: [hs,hvec,score] = hscv(data,kernel,hvec)

-           hs     = smoothing parameter
+        Parameters
+        ----------
+        data   = data vector
        hvec   = vector defining possible values of hs
                     (default linspace(0.25*h0,h0,100), h0=0.62)
+        inc = length of estimated kerneldensity estimate
+        maxit = maximum number of iterations
+        fulloutput = True if fulloutput is wanted
+
+        Returns
+        -------
+        hs     = smoothing parameter
+        hvec   = vector defining possible values of hs
        score  = score vector
-           data   = data vector
-           kernel = 'gaussian'      - Gaussian kernel the only supported

-          Note that only the first 4 letters of the kernel name is needed.
+          Example
+          ------
+          >>> import wafo.kdetools as wk
+          >>> import wafo.stats as ws
+          >>> data = ws.norm.rvs(0,1, size=(1,20))
+          >>> kernel = wk.Kernel('epan')
+          >>> hs0 = kernel.hscv(data, fulloutput=False)
+          >>> hs, hvec, score = kernel.hscv(data, fulloutput=True)
+          >>> np.allclose(hs, hs0)
+          True

-          Example:
-            data = rndnorm(0,1,20,1)
-             [hs hvec score] = hscv(data,'epan');
-             plot(hvec,score)
-         See also  hste, hbcv, hboot, hos, hldpi, hlscv, hstt, kde, kdefun
+         import matplotlib.pyplot as plt
+         plt.plot(hvec,score)

+         See also:
+         hste, hbcv, hboot, hos, hldpi, hlscv, hstt, kde, kdefun
+
+        Reference
+        ---------
        Wand,M.P. and Jones, M.C. (1986)
        'Kernel smoothing'
        Chapman and Hall, pp 75--79
        '''
-        # TODO: Add support for other kernels than Gaussian
+
        A = np.atleast_2d(data)
        d, n = A.shape

@ -1209,17 +1234,16 @@ class Kernel(object):
            idx = score.argmin()
            # Kernel other than Gaussian scale bandwidth
            h[dim] = hvec[idx] * (ste_constant / ste_constant2) ** (1 / 5)
-            if idx == 0:
-                warnings.warn("Optimum is probably lower than "
+            _assert_warn(0 < idx,
+                         "Optimum is probably lower than "
+                         "hs={0:g} for dim={1:d}".format(h[dim] * s, dim))
+            _assert_warn(idx < maxit - 1,
+                         "Optimum is probably higher than "
                         "hs={0:g} for dim={1:d}".format(h[dim] * s, dim))
-            elif idx == maxit - 1:
-                msg = "Optimum is probably higher than hs={0:g] for dim={1:d}"
-                warnings.warn(msg.format(h[dim] * s, dim))

        hvec = hvec * (ste_constant / ste_constant2) ** (1 / 5)
        if fulloutput:
-            return h * sigmaA, score, hvec, sigmaA
-        else:
+            return h * sigmaA, score, hvec
        return h * sigmaA

    def hldpi(self, data, L=2, inc=128):
@ -1361,7 +1385,7 @@ def mkernel(X, kernel):

    See also
    --------
-    kde, kdefun, kdebin
+    KDE

    References
    ----------
--- a/wafo/spectrum/core.py
+++ b/wafo/spectrum/core.py
@ -1131,7 +1131,7 @@ class SpecData1D(PlotData):
                         title='Joint density of maximum and minimum')
        try:
            pl = [10, 30, 50, 70, 90, 95, 99, 99.9]
-            mmpdf.cl = qlevels(uvdens, pl, h, h)
+            mmpdf.cl = qlevels(uvdens, pl, xi=(h, h))
            mmpdf.pl = pl
        except:
            pass
--- a/wafo/tests/test_kdetools.py
+++ b/wafo/tests/test_kdetools.py
@ -449,6 +449,27 @@ class TestSmoothing(unittest.TestCase):
        assert_allclose(hs, [[3.25196193e-01, -2.68892467e-02, 3.18932448e-04],
                             [-2.68892467e-02, 3.91283306e-01, 2.38654678e-02],
                             [3.18932448e-04, 2.38654678e-02, 4.05123874e-01]])
+        hs = self.gauss.hmns(self.data[0])
+        assert_allclose(hs, self.gauss.hns(self.data[0]))
+
+        hs = wk.Kernel('epan').hmns(self.data)
+        assert_allclose(hs,
+                        [[8.363847e-01, -6.915749e-02, 8.202747e-04],
+                         [-6.915749e-02, 1.006357e+00, 6.138052e-02],
+                         [8.202747e-04, 6.138052e-02, 1.041954e+00]],
+                        rtol=1e-5)
+        hs = wk.Kernel('biwe').hmns(self.data[:2])
+        assert_allclose(hs, [[0.868428, -0.071705],
+                             [-0.071705, 1.04685]], rtol=1e-5)
+        hs = wk.Kernel('triwe').hmns(self.data[:2])
+        assert_allclose(hs, [[0.975375, -0.080535],
+                             [-0.080535, 1.17577]], rtol=1e-5)
+        self.assertRaises(NotImplementedError,
+                          wk.Kernel('biwe').hmns, self.data)
+        self.assertRaises(NotImplementedError,
+                          wk.Kernel('triwe').hmns, self.data)
+        self.assertRaises(NotImplementedError,
+                          wk.Kernel('triangular').hmns, self.data)

    def test_hscv(self):
        hs = self.gauss.hscv(self.data)