From 67e15d7c8c5d2e9fde4ac71a28f2704b245fbcaa Mon Sep 17 00:00:00 2001 From: Per A Brodtkorb Date: Tue, 20 Dec 2016 11:45:29 +0100 Subject: [PATCH] Refactored kdetools.py into a subpackage. Simplified and removed obsolete code. --- wafo/kdetools.py | 4071 ----------------------------------- wafo/kdetools/__init__.py | 3 + wafo/kdetools/gridding.py | 324 +++ wafo/kdetools/kdetools.py | 2139 ++++++++++++++++++ wafo/kdetools/kernels.py | 1382 ++++++++++++ wafo/testing.py | 18 + wafo/tests/test_kdetools.py | 865 ++++---- 7 files changed, 4336 insertions(+), 4466 deletions(-) delete mode 100644 wafo/kdetools.py create mode 100644 wafo/kdetools/__init__.py create mode 100644 wafo/kdetools/gridding.py create mode 100644 wafo/kdetools/kdetools.py create mode 100644 wafo/kdetools/kernels.py create mode 100644 wafo/testing.py diff --git a/wafo/kdetools.py b/wafo/kdetools.py deleted file mode 100644 index c63ba99..0000000 --- a/wafo/kdetools.py +++ /dev/null @@ -1,4071 +0,0 @@ -#!/usr/bin/env python -# ------------------------------------------------------------------------- -# Name: kdetools -# Purpose: -# -# Author: pab -# -# Created: 01.11.2008 -# Copyright: (c) pab 2008 -# Licence: LGPL -# ------------------------------------------------------------------------- - -from __future__ import absolute_import, division -import copy -import numpy as np -import scipy -import warnings -from itertools import product -from scipy import interpolate, linalg, optimize, sparse, special, stats -from scipy.special import gamma -from numpy import pi, sqrt, atleast_2d, exp, newaxis # @UnresolvedImport - -from wafo.misc import meshgrid, nextpow2, tranproc # , trangood -from wafo.containers import PlotData -from wafo.dctpack import dct, dctn, idctn -from wafo.plotbackend import plotbackend as plt -try: - from wafo import fig -except ImportError: - warnings.warn('fig import only supported on Windows') - - -def _invnorm(q): - return special.ndtri(q) - -_stats_epan = (1. / 5, 3. / 5, np.inf) -_stats_biwe = (1. / 7, 5. / 7, 45. / 2) -_stats_triw = (1. / 9, 350. / 429, np.inf) -_stats_rect = (1. / 3, 1. / 2, np.inf) -_stats_tria = (1. / 6, 2. / 3, np.inf) -_stats_lapl = (2, 1. / 4, np.inf) -_stats_logi = (pi ** 2 / 3, 1. / 6, 1 / 42) -_stats_gaus = (1, 1. / (2 * sqrt(pi)), 3. / (8 * sqrt(pi))) - -__all__ = ['sphere_volume', 'TKDE', 'KDE', 'Kernel', 'accum', 'qlevels', - 'iqrange', 'gridcount', 'kde_demo1', 'kde_demo2', 'test_docstrings'] - - -def sphere_volume(d, r=1.0): - """ - Returns volume of d-dimensional sphere with radius r - - Parameters - ---------- - d : scalar or array_like - dimension of sphere - r : scalar or array_like - radius of sphere (default 1) - - Example - ------- - >>> sphere_volume(2., r=2.) - 12.566370614359172 - >>> sphere_volume(2., r=1.) - 3.1415926535897931 - - Reference - --------- - Wand,M.P. and Jones, M.C. (1995) - 'Kernel smoothing' - Chapman and Hall, pp 105 - """ - return (r ** d) * 2.0 * pi ** (d / 2.0) / (d * gamma(d / 2.0)) - - -class KDEgauss(object): - - """ Kernel-Density Estimator base class. - - Parameters - ---------- - data : (# of dims, # of data)-array - datapoints to estimate from - hs : array-like (optional) - smooting parameter vector/matrix. 
- (default compute from data using kernel.get_smoothing function) - alpha : real scalar (optional) - sensitivity parameter (default 0 regular KDE) - A good choice might be alpha = 0.5 ( or 1/D) - alpha = 0 Regular KDE (hs is constant) - 0 < alpha <= 1 Adaptive KDE (Make hs change) - - - Members - ------- - d : int - number of dimensions - n : int - number of datapoints - - Methods - ------- - kde.eval_grid_fast(x0, x1,..., xd) : array - evaluate the estimated pdf on meshgrid(x0, x1,..., xd) - kde(x0, x1,..., xd) : array - same as kde.eval_grid_fast(x0, x1,..., xd) - """ - - def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None, - xmax=None, inc=512): - self.dataset = atleast_2d(data) - self.hs = hs - self.kernel = kernel if kernel else Kernel('gauss') - self.alpha = alpha - self.xmin = xmin - self.xmax = xmax - self.inc = inc - self.initialize() - - def initialize(self): - self.d, self.n = self.dataset.shape - self._set_xlimits() - self._initialize() - - def _initialize(self): - self._compute_smoothing() - - def _compute_smoothing(self): - """Computes the smoothing matrix.""" - get_smoothing = self.kernel.get_smoothing - h = self.hs - if h is None: - h = get_smoothing(self.dataset) - h = np.atleast_1d(h) - hsiz = h.shape - - if (len(hsiz) == 1) or (self.d == 1): - if max(hsiz) == 1: - h = h * np.ones(self.d) - else: - h.shape = (self.d,) # make sure it has the correct dimension - - # If h negative calculate automatic values - ind, = np.where(h <= 0) - for i in ind.tolist(): - h[i] = get_smoothing(self.dataset[i]) - deth = h.prod() - self.inv_hs = np.diag(1.0 / h) - else: # fully general smoothing matrix - deth = linalg.det(h) - if deth <= 0: - raise ValueError( - 'bandwidth matrix h must be positive definit!') - self.inv_hs = linalg.inv(h) - self.hs = h - self._norm_factor = deth * self.n - - def _set_xlimits(self): - amin = self.dataset.min(axis=-1) - amax = self.dataset.max(axis=-1) - iqr = iqrange(self.dataset, axis=-1) - sigma = np.minimum(np.std(self.dataset, axis=-1, ddof=1), iqr / 1.34) - # xyzrange = amax - amin - # offset = xyzrange / 4.0 - offset = 2 * sigma - if self.xmin is None: - self.xmin = amin - offset - else: - self.xmin = self.xmin * np.ones((self.d, 1)) - if self.xmax is None: - self.xmax = amax + offset - else: - self.xmax = self.xmax * np.ones((self.d, 1)) - - def eval_grid_fast(self, *args, **kwds): - """Evaluate the estimated pdf on a grid. - - Parameters - ---------- - arg_0,arg_1,... arg_d-1 : vectors - Alternatively, if no vectors is passed in then - arg_i = linspace(self.xmin[i], self.xmax[i], self.inc) - output : string optional - 'value' if value output - 'data' if object output - - Returns - ------- - values : array-like - The values evaluated at meshgrid(*args). - - """ - if len(args) == 0: - args = [] - for i in range(self.d): - args.append(np.linspace(self.xmin[i], self.xmax[i], self.inc)) - self.args = args - return self._eval_grid_fun(self._eval_grid_fast, *args, **kwds) - - def _eval_grid_fast(self, *args, **kwds): - X = np.vstack(args) - d, inc = X.shape - # dx = X[:, 1] - X[:, 0] - R = X.max(axis=-1) - X.min(axis=-1) - - t_star = (self.hs / R) ** 2 - I = (np.asfarray(np.arange(0, inc)) * pi) ** 2 - In = [] - - for i in range(d): - In.append(I * t_star[i] * 0.5) - - Inc = meshgrid(*In) if d > 1 else In - - kw = np.zeros((inc,) * d) - for i in range(d): - kw += exp(-Inc[i]) - y = kwds.get('y', 1.0) - d, n = self.dataset.shape - # Find the binned kernel weights, c. - c = gridcount(self.dataset, X, y=y) / n - # Perform the convolution. 
- at = dctn(c) * kw - z = idctn(at) * at.size / np.prod(R) - return z * (z > 0.0) - - def _eval_grid_fun(self, eval_grd, *args, **kwds): - output = kwds.pop('output', 'value') - f = eval_grd(*args, **kwds) - if output == 'value': - return f - else: - titlestr = 'Kernel density estimate (%s)' % self.kernel.name - kwds2 = dict(title=titlestr) - kwds2['plot_kwds'] = dict(plotflag=1) - kwds2.update(**kwds) - args = self.args - if self.d == 1: - args = args[0] - wdata = PlotData(f, args, **kwds2) - if self.d > 1: - PL = np.r_[10:90:20, 95, 99, 99.9] - try: - ql = qlevels(f, p=PL) - wdata.clevels = ql - wdata.plevels = PL - except: - pass - return wdata - - def _check_shape(self, points): - points = atleast_2d(points) - d, m = points.shape - if d != self.d: - if d == 1 and m == self.d: - # points was passed in as a row vector - points = np.reshape(points, (self.d, 1)) - else: - msg = "points have dimension %s, dataset has dimension %s" - raise ValueError(msg % (d, self.d)) - return points - - def eval_points(self, points, **kwds): - """Evaluate the estimated pdf on a set of points. - - Parameters - ---------- - points : (# of dimensions, # of points)-array - Alternatively, a (# of dimensions,) vector can be passed in and - treated as a single point. - - Returns - ------- - values : (# of points,)-array - The values at each point. - - Raises - ------ - ValueError if the dimensionality of the input points is different than - the dimensionality of the KDE. - - """ - - points = self._check_shape(points) - return self._eval_points(points, **kwds) - - def _eval_points(self, points, **kwds): - pass - - __call__ = eval_grid_fast - - -class _KDE(object): - - """ Kernel-Density Estimator base class. - - Parameters - ---------- - data : (# of dims, # of data)-array - datapoints to estimate from - hs : array-like (optional) - smooting parameter vector/matrix. - (default compute from data using kernel.get_smoothing function) - kernel : kernel function object. 
- kernel must have get_smoothing method - alpha : real scalar (optional) - sensitivity parameter (default 0 regular KDE) - A good choice might be alpha = 0.5 ( or 1/D) - alpha = 0 Regular KDE (hs is constant) - 0 < alpha <= 1 Adaptive KDE (Make hs change) - - Members - ------- - d : int - number of dimensions - n : int - number of datapoints - - Methods - ------- - kde.eval_grid_fast(x0, x1,..., xd) : array - evaluate the estimated pdf on meshgrid(x0, x1,..., xd) - kde.eval_grid(x0, x1,..., xd) : array - evaluate the estimated pdf on meshgrid(x0, x1,..., xd) - kde.eval_points(points) : array - evaluate the estimated pdf on a provided set of points - kde(x0, x1,..., xd) : array - same as kde.eval_grid(x0, x1,..., xd) - """ - - def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None, - xmax=None, inc=512): - self.dataset = atleast_2d(data) - self.hs = hs - self.kernel = kernel if kernel else Kernel('gauss') - self.alpha = alpha - self.xmin = xmin - self.xmax = xmax - self.inc = inc - self.initialize() - - def initialize(self): - self.d, self.n = self.dataset.shape - if self.n > 1: - self._set_xlimits() - self._initialize() - - def _initialize(self): - pass - - def _set_xlimits(self): - amin = self.dataset.min(axis=-1) - amax = self.dataset.max(axis=-1) - iqr = iqrange(self.dataset, axis=-1) - self._sigma = np.minimum( - np.std(self.dataset, axis=-1, ddof=1), iqr / 1.34) - # xyzrange = amax - amin - # offset = xyzrange / 4.0 - offset = self._sigma - if self.xmin is None: - self.xmin = amin - offset - else: - self.xmin = self.xmin * np.ones((self.d, 1)) - if self.xmax is None: - self.xmax = amax + offset - else: - self.xmax = self.xmax * np.ones((self.d, 1)) - - def get_args(self, xmin=None, xmax=None): - if xmin is None: - xmin = self.xmin - else: - xmin = [min(i, j) for i, j in zip(xmin, self.xmin)] - if xmax is None: - xmax = self.xmax - else: - xmax = [max(i, j) for i, j in zip(xmax, self.xmax)] - args = [] - for i in range(self.d): - args.append(np.linspace(xmin[i], xmax[i], self.inc)) - return args - - def eval_grid_fast(self, *args, **kwds): - """Evaluate the estimated pdf on a grid. - - Parameters - ---------- - arg_0,arg_1,... arg_d-1 : vectors - Alternatively, if no vectors is passed in then - arg_i = linspace(self.xmin[i], self.xmax[i], self.inc) - output : string optional - 'value' if value output - 'data' if object output - - Returns - ------- - values : array-like - The values evaluated at meshgrid(*args). - - """ - if len(args) == 0: - args = self.get_args() - self.args = args - return self._eval_grid_fun(self._eval_grid_fast, *args, **kwds) - - def _eval_grid_fast(self, *args, **kwds): - pass - - def eval_grid(self, *args, **kwds): - """Evaluate the estimated pdf on a grid. - - Parameters - ---------- - arg_0,arg_1,... arg_d-1 : vectors - Alternatively, if no vectors is passed in then - arg_i = linspace(self.xmin[i], self.xmax[i], self.inc) - output : string optional - 'value' if value output - 'data' if object output - - Returns - ------- - values : array-like - The values evaluated at meshgrid(*args). 
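A hypothetical usage sketch (illustrative only, not part of the original docstring), assuming `np` is NumPy and `kde` is an already-fitted 2-D estimator:

    x0 = np.linspace(-3, 3, 128)
    x1 = np.linspace(-2, 2, 128)
    f = kde.eval_grid(x0, x1)   # evaluated on meshgrid(x0, x1); f.shape == (128, 128)
    f0 = kde.eval_grid()        # default grid: linspace(xmin[i], xmax[i], inc) per axis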
- - """ - if len(args) == 0: - args = [] - for i in range(self.d): - args.append(np.linspace(self.xmin[i], self.xmax[i], self.inc)) - self.args = args - return self._eval_grid_fun(self._eval_grid, *args, **kwds) - - def _eval_grid(self, *args): - pass - - def _eval_grid_fun(self, eval_grd, *args, **kwds): - output = kwds.pop('output', 'value') - f = eval_grd(*args, **kwds) - if output == 'value': - return f - else: - titlestr = 'Kernel density estimate (%s)' % self.kernel.name - kwds2 = dict(title=titlestr) - - kwds2['plot_kwds'] = kwds.pop('plot_kwds', dict(plotflag=1)) - kwds2.update(**kwds) - args = self.args - if self.d == 1: - args = args[0] - wdata = PlotData(f, args, **kwds2) - if self.d > 1: - PL = np.r_[10:90:20, 95, 99, 99.9] - try: - ql = qlevels(f, p=PL) - wdata.clevels = ql - wdata.plevels = PL - except: - pass - return wdata - - def _check_shape(self, points): - points = atleast_2d(points) - d, m = points.shape - if d != self.d: - if d == 1 and m == self.d: - # points was passed in as a row vector - points = np.reshape(points, (self.d, 1)) - else: - msg = "points have dimension %s, dataset has dimension %s" - raise ValueError(msg % (d, self.d)) - return points - - def eval_points(self, points, **kwds): - """Evaluate the estimated pdf on a set of points. - - Parameters - ---------- - points : (# of dimensions, # of points)-array - Alternatively, a (# of dimensions,) vector can be passed in and - treated as a single point. - - Returns - ------- - values : (# of points,)-array - The values at each point. - - Raises - ------ - ValueError if the dimensionality of the input points is different than - the dimensionality of the KDE. - - """ - - points = self._check_shape(points) - return self._eval_points(points, **kwds) - - def _eval_points(self, points, **kwds): - pass - - __call__ = eval_grid - - -class TKDE(_KDE): - - """ Transformation Kernel-Density Estimator. - - Parameters - ---------- - dataset : (# of dims, # of data)-array - datapoints to estimate from - hs : array-like (optional) - smooting parameter vector/matrix. - (default compute from data using kernel.get_smoothing function) - kernel : kernel function object. - kernel must have get_smoothing method - alpha : real scalar (optional) - sensitivity parameter (default 0 regular KDE) - A good choice might be alpha = 0.5 ( or 1/D) - alpha = 0 Regular KDE (hs is constant) - 0 < alpha <= 1 Adaptive KDE (Make hs change) - xmin, xmax : vectors - specifying the default argument range for the kde.eval_grid methods. - For the kde.eval_grid_fast methods the values must cover the range of - the data. (default min(data)-range(data)/4, max(data)-range(data)/4) - If a single value of xmin or xmax is given then the boundary is the is - the same for all dimensions. - inc : scalar integer - defining the default dimension of the output from kde.eval_grid methods - (default 512) - (For kde.eval_grid_fast: A value below 50 is very fast to compute but - may give some inaccuracies. Values between 100 and 500 give very - accurate results) - L2 : array-like - vector of transformation parameters (default 1 no transformation) - t(xi;L2) = xi^L2*sign(L2) for L2(i) ~= 0 - t(xi;L2) = log(xi) for L2(i) == 0 - If single value of L2 is given then the transformation is the same in - all directions. 
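For illustration, a minimal sketch of the forward transform and of the Jacobian factor that TKDE uses to map the transformed-scale pdf back to the original scale, assuming a scalar L2 and positive 1-D data (the helper names below are illustrative, not part of the original class):

    import numpy as np

    def l2_transform(x, L2):
        """t(x; L2): power transform, or log(x) when L2 == 0 (sketch)."""
        x = np.asarray(x, dtype=float)
        return np.log(x) if L2 == 0 else x ** L2

    def l2_jacobian(x, L2):
        """|dt/dx|: 1/x when L2 == 0, else |L2| * x**(L2 - 1) (sketch)."""
        x = np.asarray(x, dtype=float)
        return 1.0 / x if L2 == 0 else abs(L2) * x ** (L2 - 1)

    # pdf on the original scale: f_X(x) ~= f_T(l2_transform(x, L2)) * l2_jacobian(x, L2)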
- - Members - ------- - d : int - number of dimensions - n : int - number of datapoints - - Methods - ------- - kde.eval_grid_fast(x0, x1,..., xd) : array - evaluate the estimated pdf on meshgrid(x0, x1,..., xd) - kde.eval_grid(x0, x1,..., xd) : array - evaluate the estimated pdf on meshgrid(x0, x1,..., xd) - kde.eval_points(points) : array - evaluate the estimated pdf on a provided set of points - kde(x0, x1,..., xd) : array - same as kde.eval_grid(x0, x1,..., xd) - - Example - ------- - N = 20 - data = np.random.rayleigh(1, size=(N,)) - >>> data = np.array([ - ... 0.75355792, 0.72779194, 0.94149169, 0.07841119,2.32291887, - ... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, - ... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, - ... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452]) - - >>> import wafo.kdetools as wk - >>> x = np.linspace(0.01, max(data.ravel()) + 1, 10) - >>> kde = wk.TKDE(data, hs=0.5, L2=0.5) - >>> f = kde(x) - >>> f - array([ 1.03982714, 0.45839018, 0.39514782, 0.32860602, 0.26433318, - 0.20717946, 0.15907684, 0.1201074 , 0.08941027, 0.06574882]) - - >>> kde.eval_grid(x) - array([ 1.03982714, 0.45839018, 0.39514782, 0.32860602, 0.26433318, - 0.20717946, 0.15907684, 0.1201074 , 0.08941027, 0.06574882]) - - >>> kde.eval_grid_fast(x) - array([ 1.04018924, 0.45838973, 0.39514689, 0.32860532, 0.26433301, - 0.20717976, 0.15907697, 0.1201077 , 0.08941129, 0.06574899]) - - import pylab as plb - h1 = plb.plot(x, f) # 1D probability density plot - t = np.trapz(f, x) - """ - - def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None, - xmax=None, inc=512, L2=None): - self.L2 = L2 - super(TKDE, self).__init__(data, hs, kernel, alpha, xmin, xmax, inc) - - def _initialize(self): - self._check_xmin() - tdataset = self._dat2gaus(self.dataset) - xmin = self.xmin - if xmin is not None: - xmin = self._dat2gaus(np.reshape(xmin, (-1, 1))) - xmax = self.xmax - if xmax is not None: - xmax = self._dat2gaus(np.reshape(xmax, (-1, 1))) - self.tkde = KDE(tdataset, self.hs, self.kernel, self.alpha, xmin, xmax, - self.inc) - if self.inc is None: - self.inc = self.tkde.inc - - def _check_xmin(self): - if self.L2 is not None: - amin = self.dataset.min(axis=-1) - # default no transformation - L2 = np.atleast_1d(self.L2) * np.ones(self.d) - self.xmin = np.where(L2 != 1, np.maximum( - self.xmin, amin / 100.0), self.xmin).reshape((-1, 1)) - - def _dat2gaus(self, points): - if self.L2 is None: - return points # default no transformation - - # default no transformation - L2 = np.atleast_1d(self.L2) * np.ones(self.d) - - tpoints = copy.copy(points) - for i, v2 in enumerate(L2.tolist()): - tpoints[i] = np.log(points[i]) if v2 == 0 else points[i] ** v2 - return tpoints - - def _gaus2dat(self, tpoints): - if self.L2 is None: - return tpoints # default no transformation - - # default no transformation - L2 = np.atleast_1d(self.L2) * np.ones(self.d) - - points = copy.copy(tpoints) - for i, v2 in enumerate(L2.tolist()): - points[i] = np.exp( - tpoints[i]) if v2 == 0 else tpoints[i] ** (1.0 / v2) - return points - - def _scale_pdf(self, pdf, points): - if self.L2 is None: - return pdf - # default no transformation - L2 = np.atleast_1d(self.L2) * np.ones(self.d) - for i, v2 in enumerate(L2.tolist()): - factor = v2 * np.sign(v2) if v2 else 1 - pdf *= np.where(v2 == 1, 1, points[i] ** (v2 - 1) * factor) - if (np.abs(np.diff(pdf)).max() > 10).any(): - msg = ''' Numerical problems may have occured due to the power - transformation. 
Check the KDE for spurious spikes''' - warnings.warn(msg) - return pdf - - def eval_grid_fast2(self, *args, **kwds): - """Evaluate the estimated pdf on a grid. - - Parameters - ---------- - arg_0,arg_1,... arg_d-1 : vectors - Alternatively, if no vectors is passed in then - arg_i = gauss2dat(linspace(dat2gauss(self.xmin[i]), - dat2gauss(self.xmax[i]), self.inc)) - output : string optional - 'value' if value output - 'data' if object output - - Returns - ------- - values : array-like - The values evaluated at meshgrid(*args). - - """ - return self._eval_grid_fun(self._eval_grid_fast, *args, **kwds) - - def _eval_grid_fast(self, *args, **kwds): - if self.L2 is None: - f = self.tkde.eval_grid_fast(*args, **kwds) - self.args = self.tkde.args - return f - targs = [] - if len(args): - targs0 = self._dat2gaus(list(args)) - xmin = [min(t) for t in targs0] - xmax = [max(t) for t in targs0] - targs = self.tkde.get_args(xmin, xmax) - tf = self.tkde.eval_grid_fast(*targs) - self.args = self._gaus2dat(list(self.tkde.args)) - points = meshgrid(*self.args) if self.d > 1 else self.args - f = self._scale_pdf(tf, points) - if len(args): - ipoints = meshgrid(*args) if self.d > 1 else args - # shape0 = points[0].shape - # shape0i = ipoints[0].shape - for i in range(self.d): - points[i].shape = (-1,) - # ipoints[i].shape = (-1,) - points = np.asarray(points).T - # ipoints = np.asarray(ipoints).T - fi = interpolate.griddata(points, f.ravel(), tuple(ipoints), - method='linear', - fill_value=0.0) - # fi.shape = shape0i - self.args = args - r = kwds.get('r', 0) - if r == 0: - return fi * (fi > 0) - else: - return fi - return f - - def _eval_grid(self, *args, **kwds): - if self.L2 is None: - return self.tkde.eval_grid(*args, **kwds) - targs = self._dat2gaus(list(args)) - tf = self.tkde.eval_grid(*targs, **kwds) - points = meshgrid(*args) if self.d > 1 else self.args - f = self._scale_pdf(tf, points) - return f - - def _eval_points(self, points): - """Evaluate the estimated pdf on a set of points. - - Parameters - ---------- - points : (# of dimensions, # of points)-array - Alternatively, a (# of dimensions,) vector can be passed in and - treated as a single point. - - Returns - ------- - values : (# of points,)-array - The values at each point. - - Raises - ------ - ValueError if the dimensionality of the input points is different than - the dimensionality of the KDE. - - """ - if self.L2 is None: - return self.tkde.eval_points(points) - - tpoints = self._dat2gaus(points) - tf = self.tkde.eval_points(tpoints) - f = self._scale_pdf(tf, points) - return f - - -class KDE(_KDE): - - """ Kernel-Density Estimator. - - Parameters - ---------- - data : (# of dims, # of data)-array - datapoints to estimate from - hs : array-like (optional) - smooting parameter vector/matrix. - (default compute from data using kernel.get_smoothing function) - kernel : kernel function object. - kernel must have get_smoothing method - alpha : real scalar (optional) - sensitivity parameter (default 0 regular KDE) - A good choice might be alpha = 0.5 ( or 1/D) - alpha = 0 Regular KDE (hs is constant) - 0 < alpha <= 1 Adaptive KDE (Make hs change) - xmin, xmax : vectors - specifying the default argument range for the kde.eval_grid methods. - For the kde.eval_grid_fast methods the values must cover the range of - the data. - (default min(data)-range(data)/4, max(data)-range(data)/4) - If a single value of xmin or xmax is given then the boundary is the is - the same for all dimensions. 
- inc : scalar integer (default 512) - defining the default dimension of the output from kde.eval_grid methods - (For kde.eval_grid_fast: A value below 50 is very fast to compute but - may give some inaccuracies. Values between 100 and 500 give very - accurate results) - - Members - ------- - d : int - number of dimensions - n : int - number of datapoints - - Methods - ------- - kde.eval_grid_fast(x0, x1,..., xd) : array - evaluate the estimated pdf on meshgrid(x0, x1,..., xd) - kde.eval_grid(x0, x1,..., xd) : array - evaluate the estimated pdf on meshgrid(x0, x1,..., xd) - kde.eval_points(points) : array - evaluate the estimated pdf on a provided set of points - kde(x0, x1,..., xd) : array - same as kde.eval_grid(x0, x1,..., xd) - - Example - ------- - N = 20 - data = np.random.rayleigh(1, size=(N,)) - >>> data = np.array([ - ... 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887, - ... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, - ... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, - ... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452]) - - >>> x = np.linspace(0, max(data.ravel()) + 1, 10) - >>> import wafo.kdetools as wk - >>> kde = wk.KDE(data, hs=0.5, alpha=0.5) - >>> f = kde(x) - >>> f - array([ 0.17252055, 0.41014271, 0.61349072, 0.57023834, 0.37198073, - 0.21409279, 0.12738463, 0.07460326, 0.03956191, 0.01887164]) - - >>> kde.eval_grid(x) - array([ 0.17252055, 0.41014271, 0.61349072, 0.57023834, 0.37198073, - 0.21409279, 0.12738463, 0.07460326, 0.03956191, 0.01887164]) - - >>> kde0 = wk.KDE(data, hs=0.5, alpha=0.0) - >>> kde0.eval_points(x) - array([ 0.2039735 , 0.40252503, 0.54595078, 0.52219649, 0.3906213 , - 0.26381501, 0.16407362, 0.08270612, 0.02991145, 0.00720821]) - - >>> kde0.eval_grid(x) - array([ 0.2039735 , 0.40252503, 0.54595078, 0.52219649, 0.3906213 , - 0.26381501, 0.16407362, 0.08270612, 0.02991145, 0.00720821]) - >>> f = kde0.eval_grid(x, output='plotobj') - >>> f.data - array([ 0.2039735 , 0.40252503, 0.54595078, 0.52219649, 0.3906213 , - 0.26381501, 0.16407362, 0.08270612, 0.02991145, 0.00720821]) - - >>> f = kde0.eval_grid_fast() - >>> np.allclose(np.interp(x, kde0.args[0], f), - ... [ 0.20397743, 0.40252228, 0.54594119, 0.52219025, 0.39062189, - ... 0.2638171 , 0.16407487, 0.08270755, 0.04784434, 0.04784434]) - True - >>> f1 = kde0.eval_grid_fast(output='plot') - >>> np.allclose(np.interp(x, f1.args, f1.data), - ... [ 0.20397743, 0.40252228, 0.54594119, 0.52219025, 0.39062189, - ... 
0.2638171 , 0.16407487, 0.08270755, 0.04784434, 0.04784434]) - True - - h = f1.plot() - import pylab as plb - h1 = plb.plot(x, f) # 1D probability density plot - t = np.trapz(f, x) - """ - - def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None, - xmax=None, inc=512): - super(KDE, self).__init__(data, hs, kernel, alpha, xmin, xmax, inc) - - def _initialize(self): - self._compute_smoothing() - self._lambda = np.ones(self.n) - if self.alpha > 0: - # pilt = KDE(self.dataset, hs=self.hs, kernel=self.kernel, alpha=0) - # f = pilt.eval_points(self.dataset) # get a pilot estimate by - # regular KDE (alpha=0) - f = self.eval_points(self.dataset) # pilot estimate - g = np.exp(np.mean(np.log(f))) - self._lambda = (f / g) ** (-self.alpha) - - if self.inc is None: - unused_tau, tau = self.kernel.effective_support() - xyzrange = 8 * self._sigma - L1 = 10 - self.inc = 2 ** nextpow2( - max(48, (L1 * xyzrange / (tau * self.hs)).max())) - pass - - def _compute_smoothing(self): - """Computes the smoothing matrix.""" - get_smoothing = self.kernel.get_smoothing - h = self.hs - if h is None: - h = get_smoothing(self.dataset) - h = np.atleast_1d(h) - hsiz = h.shape - - if (len(hsiz) == 1) or (self.d == 1): - if max(hsiz) == 1: - h = h * np.ones(self.d) - else: - h.shape = (self.d,) # make sure it has the correct dimension - - # If h negative calculate automatic values - ind, = np.where(h <= 0) - for i in ind.tolist(): - h[i] = get_smoothing(self.dataset[i]) - deth = h.prod() - self.inv_hs = np.diag(1.0 / h) - else: # fully general smoothing matrix - deth = linalg.det(h) - if deth <= 0: - raise ValueError( - 'bandwidth matrix h must be positive definit!') - self.inv_hs = linalg.inv(h) - self.hs = h - self._norm_factor = deth * self.n - - def _eval_grid_fast(self, *args, **kwds): - X = np.vstack(args) - d, inc = X.shape - dx = X[:, 1] - X[:, 0] - - Xn = [] - nfft0 = 2 * inc - nfft = (nfft0,) * d - x0 = np.linspace(-inc, inc, nfft0 + 1) - for i in range(d): - Xn.append(x0[:-1] * dx[i]) - - Xnc = meshgrid(*Xn) if d > 1 else Xn - - shape0 = Xnc[0].shape - for i in range(d): - Xnc[i].shape = (-1,) - - Xn = np.dot(self.inv_hs, np.vstack(Xnc)) - - # Obtain the kernel weights. - kw = self.kernel(Xn) - - # plt.plot(kw) - # plt.draw() - # plt.show() - norm_fact0 = (kw.sum() * dx.prod() * self.n) - norm_fact = (self._norm_factor * self.kernel.norm_factor(d, self.n)) - if np.abs(norm_fact0 - norm_fact) > 0.05 * norm_fact: - warnings.warn( - 'Numerical inaccuracy due to too low discretization. ' + - 'Increase the discretization of the evaluation grid ' + - '(inc=%d)!' % inc) - norm_fact = norm_fact0 - - kw = kw / norm_fact - r = kwds.get('r', 0) - if r != 0: - kw *= np.vstack(Xnc) ** r if d > 1 else Xnc[0] - kw.shape = shape0 - kw = np.fft.ifftshift(kw) - fftn = np.fft.fftn - ifftn = np.fft.ifftn - - y = kwds.get('y', 1.0) - # if self.alpha>0: - # y = y / self._lambda**d - - # Find the binned kernel weights, c. - c = gridcount(self.dataset, X, y=y) - # Perform the convolution. - z = np.real(ifftn(fftn(c, s=nfft) * fftn(kw))) - - ix = (slice(0, inc),) * d - if r == 0: - return z[ix] * (z[ix] > 0.0) - else: - return z[ix] - - def _eval_grid(self, *args, **kwds): - - grd = meshgrid(*args) if len(args) > 1 else list(args) - shape0 = grd[0].shape - d = len(grd) - for i in range(d): - grd[i] = grd[i].ravel() - f = self.eval_points(np.vstack(grd), **kwds) - return f.reshape(shape0) - - def _eval_points(self, points, **kwds): - """Evaluate the estimated pdf on a set of points. 
- - Parameters - ---------- - points : (# of dimensions, # of points)-array - Alternatively, a (# of dimensions,) vector can be passed in and - treated as a single point. - - Returns - ------- - values : (# of points,)-array - The values at each point. - - Raises - ------ - ValueError if the dimensionality of the input points is different than - the dimensionality of the KDE. - - """ - d, m = points.shape - - result = np.zeros((m,)) - - r = kwds.get('r', 0) - if r == 0: - def fun(xi): - return 1 - else: - def fun(xi): - return (xi ** r).sum(axis=0) - - if m >= self.n: - y = kwds.get('y', np.ones(self.n)) - # there are more points than data, so loop over data - for i in range(self.n): - diff = self.dataset[:, i, np.newaxis] - points - tdiff = np.dot(self.inv_hs / self._lambda[i], diff) - result += y[i] * \ - fun(diff) * self.kernel(tdiff) / self._lambda[i] ** d - else: - y = kwds.get('y', 1) - # loop over points - for i in range(m): - diff = self.dataset - points[:, i, np.newaxis] - tdiff = np.dot(self.inv_hs, diff / self._lambda[np.newaxis, :]) - tmp = y * fun(diff) * self.kernel(tdiff) / self._lambda ** d - result[i] = tmp.sum(axis=-1) - - result /= (self._norm_factor * self.kernel.norm_factor(d, self.n)) - - return result - - -class KRegression(_KDE): - - """ Kernel-Regression - - Parameters - ---------- - data : (# of dims, # of data)-array - datapoints to estimate from - y : # of data - array - response variable - p : scalar integer (0 or 1) - Nadaraya-Watson estimator if p=0, - local linear estimator if p=1. - hs : array-like (optional) - smooting parameter vector/matrix. - (default compute from data using kernel.get_smoothing function) - kernel : kernel function object. - kernel must have get_smoothing method - alpha : real scalar (optional) - sensitivity parameter (default 0 regular KDE) - A good choice might be alpha = 0.5 ( or 1/D) - alpha = 0 Regular KDE (hs is constant) - 0 < alpha <= 1 Adaptive KDE (Make hs change) - xmin, xmax : vectors - specifying the default argument range for the kde.eval_grid methods. - For the kde.eval_grid_fast methods the values must cover the range of - the data. (default min(data)-range(data)/4, max(data)-range(data)/4) - If a single value of xmin or xmax is given then the boundary is the is - the same for all dimensions. - inc : scalar integer (default 128) - defining the default dimension of the output from kde.eval_grid methods - (For kde.eval_grid_fast: A value below 50 is very fast to compute but - may give some inaccuracies. Values between 100 and 500 give very - accurate results) - - Members - ------- - d : int - number of dimensions - n : int - number of datapoints - - Methods - ------- - kde.eval_grid_fast(x0, x1,..., xd) : array - evaluate the estimated pdf on meshgrid(x0, x1,..., xd) - kde.eval_grid(x0, x1,..., xd) : array - evaluate the estimated pdf on meshgrid(x0, x1,..., xd) - kde.eval_points(points) : array - evaluate the estimated pdf on a provided set of points - kde(x0, x1,..., xd) : array - same as kde.eval_grid(x0, x1,..., xd) - - - Example - ------- - >>> import wafo.kdetools as wk - >>> N = 100 - >>> x = np.linspace(0, 1, N) - >>> ei = np.random.normal(loc=0, scale=0.075, size=(N,)) - >>> ei = np.sqrt(0.075) * np.sin(100*x) - - >>> y = 2*np.exp(-x**2/(2*0.3**2))+3*np.exp(-(x-1)**2/(2*0.7**2)) + ei - >>> kreg = wk.KRegression(x, y) - >>> f = kreg(output='plotobj', title='Kernel regression', plotflag=1) - >>> np.allclose(f.data[:5], - ... 
[ 3.18381052, 3.18362269, 3.18343648, 3.1832536 , 3.1830757 ]) - True - - h = f.plot(label='p=0') - """ - - def __init__(self, data, y, p=0, hs=None, kernel=None, alpha=0.0, - xmin=None, xmax=None, inc=128, L2=None): - - self.tkde = TKDE(data, hs=hs, kernel=kernel, - alpha=alpha, xmin=xmin, xmax=xmax, inc=inc, L2=L2) - self.y = y - self.p = p - - def eval_grid_fast(self, *args, **kwds): - self._grdfun = self.tkde.eval_grid_fast - return self.tkde._eval_grid_fun(self._eval_gridfun, *args, **kwds) - - def eval_grid(self, *args, **kwds): - self._grdfun = self.tkde.eval_grid - return self.tkde._eval_grid_fun(self._eval_gridfun, *args, **kwds) - - def _eval_gridfun(self, *args, **kwds): - grdfun = self._grdfun - s0 = grdfun(*args, r=0) - t0 = grdfun(*args, r=0, y=self.y) - if self.p == 0: - return (t0 / (s0 + _TINY)).clip(min=-_REALMAX, max=_REALMAX) - elif self.p == 1: - s1 = grdfun(*args, r=1) - s2 = grdfun(*args, r=2) - t1 = grdfun(*args, r=1, y=self.y) - return ((s2 * t0 - s1 * t1) / - (s2 * s0 - s1 ** 2)).clip(min=-_REALMAX, max=_REALMAX) - __call__ = eval_grid_fast - - -class BKRegression(object): - - ''' - Kernel-Regression on binomial data - - method : {'beta', 'wilson'} - method is one of the following - 'beta', return Bayesian Credible interval using beta-distribution. - 'wilson', return Wilson score interval - a, b : scalars - parameters of the beta distribution defining the apriori distribution - of p, i.e., the Bayes estimator for p: p = (y+a)/(n+a+b). - Setting a=b=0.5 gives Jeffreys interval. - ''' - - def __init__(self, *args, **kwds): - self.method = kwds.pop('method', 'beta') - self.a = max(kwds.pop('a', 0.5), _TINY) - self.b = max(kwds.pop('b', 0.5), _TINY) - self.kreg = KRegression(*args, **kwds) - # defines bin width (i.e. smoothing) in empirical estimate - self.hs_e = None -# self.x = self.kreg.tkde.dataset -# self.y = self.kreg.y - - def _set_smoothing(self, hs): - self.kreg.tkde.hs = hs - self.kreg.tkde.initialize() - - x = property(fget=lambda cls: cls.kreg.tkde.dataset.squeeze()) - y = property(fget=lambda cls: cls.kreg.y) - kernel = property(fget=lambda cls: cls.kreg.tkde.kernel) - hs = property(fset=_set_smoothing, fget=lambda cls: cls.kreg.tkde.hs) - - def _get_max_smoothing(self, fun=None): - """Return maximum value for smoothing parameter.""" - x = self.x - y = self.y - if fun is None: - get_smoothing = self.kernel.get_smoothing - else: - get_smoothing = getattr(self.kernel, fun) - - hs1 = get_smoothing(x) - # hx = np.median(np.abs(x-np.median(x)))/0.6745*(4.0/(3*n))**0.2 - if (y == 1).any(): - hs2 = get_smoothing(x[y == 1]) - # hy = np.median(np.abs(y-np.mean(y)))/0.6745*(4.0/(3*n))**0.2 - else: - hs2 = 4 * hs1 - # hy = 4*hx - - hopt = sqrt(hs1 * hs2) - return hopt, hs1, hs2 - - def get_grid(self, hs_e=None): - if hs_e is None: - if self.hs_e is None: - hs1 = self._get_max_smoothing('hste')[0] - hs2 = self._get_max_smoothing('hos')[0] - self.hs_e = sqrt(hs1 * hs2) - hs_e = self.hs_e - x = self.x - xmin, xmax = x.min(), x.max() - ni = max(2 * int((xmax - xmin) / hs_e) + 3, 5) - sml = hs_e # *0.1 - xi = np.linspace(xmin - sml, xmax + sml, ni) - return xi - - def prb_ci(self, n, p, alpha=0.05, **kwds): - """Return Confidence Interval for the binomial probability p. 
- - Parameters - ---------- - n : array-like - number of Bernoulli trials - p : array-like - estimated probability of success in each trial - alpha : scalar - confidence level - method : {'beta', 'wilson'} - method is one of the following - 'beta', return Bayesian Credible interval using beta-distribution. - 'wilson', return Wilson score interval - a, b : scalars - parameters of the beta distribution defining the apriori - distribution of p, i.e., - the Bayes estimator for p: p = (y+a)/(n+a+b). - Setting a=b=0.5 gives Jeffreys interval. - - """ - if self.method.startswith('w'): - # Wilson score - z0 = -_invnorm(alpha / 2) - den = 1 + (z0 ** 2. / n) - xc = (p + (z0 ** 2) / (2 * n)) / den - halfwidth = (z0 * sqrt((p * (1 - p) / n) + - (z0 ** 2 / (4 * (n ** 2))))) / den - plo = (xc - halfwidth).clip(min=0) # wilson score - pup = (xc + halfwidth).clip(max=1.0) # wilson score - else: - # Jeffreys intervall a=b=0.5 - # st.beta.isf(alpha/2, y+a, n-y+b) y = n*p, n-y = n*(1-p) - a = self.a - b = self.b - st = stats - pup = np.where( - p == 1, 1, st.beta.isf(alpha / 2, n * p + a, n * (1 - p) + b)) - plo = np.where(p == 0, 0, - st.beta.isf(1 - alpha / 2, - n * p + a, n * (1 - p) + b)) - return plo, pup - - def prb_empirical(self, xi=None, hs_e=None, alpha=0.05, color='r', **kwds): - """Returns empirical binomial probabiltity. - - Parameters - ---------- - x : ndarray - position vector - y : ndarray - binomial response variable (zeros and ones) - alpha : scalar - confidence level - color: - used in plot - - Returns - ------- - P(x) : PlotData object - empirical probability - - """ - if xi is None: - xi = self.get_grid(hs_e) - - x = self.x - y = self.y - - c = gridcount(x, xi) # + self.a + self.b # count data - if (y == 1).any(): - c0 = gridcount(x[y == 1], xi) # + self.a # count success - else: - c0 = np.zeros(xi.shape) - prb = np.where(c == 0, 0, c0 / (c + _TINY)) # assume prb==0 for c==0 - CI = np.vstack(self.prb_ci(c, prb, alpha, **kwds)) - - prb_e = PlotData(prb, xi, plotmethod='plot', plot_args=['.'], - plot_kwds=dict(markersize=6, color=color, picker=5)) - prb_e.dataCI = CI.T - prb_e.count = c - return prb_e - - def prb_smoothed(self, prb_e, hs, alpha=0.05, color='r', label=''): - """Return smoothed binomial probability. 
- - Parameters - ---------- - prb_e : PlotData object with empirical binomial probabilites - hs : smoothing parameter - alpha : confidence level - color : color of plot object - label : label for plot object - - """ - - x_e = prb_e.args - n_e = len(x_e) - dx_e = x_e[1] - x_e[0] - n = self.x.size - - x_s = np.linspace(x_e[0], x_e[-1], 10 * n_e + 1) - self.hs = hs - - prb_s = self.kreg(x_s, output='plotobj', title='', plot_kwds=dict( - color=color, linewidth=2)) # dict(plotflag=7)) - m_nan = np.isnan(prb_s.data) - if m_nan.any(): # assume 0/0 division - prb_s.data[m_nan] = 0.0 - - # prb_s.data[np.isnan(prb_s.data)] = 0 - # expected number of data in each bin - c_s = self.kreg.tkde.eval_grid_fast(x_s) * dx_e * n - plo, pup = self.prb_ci(c_s, prb_s.data, alpha) - - prb_s.dataCI = np.vstack((plo, pup)).T - prb_s.prediction_error_avg = np.trapz( - pup - plo, x_s) / (x_s[-1] - x_s[0]) - - if label: - prb_s.plot_kwds['label'] = label - prb_s.children = [PlotData([plo, pup], x_s, - plotmethod='fill_between', - plot_kwds=dict(alpha=0.2, color=color)), - prb_e] - - # empirical oversmooths the data -# p_s = prb_s.eval_points(self.x) -# dp_s = np.diff(prb_s.data) -# k = (dp_s[:-1]*dp_s[1:]<0).sum() # numpeaks -# p_e = self.y -# n_s = interpolate.interp1d(x_s, c_s)(self.x) -# plo, pup = self.prb_ci(n_s, p_s, alpha) -# sigmai = (pup-plo) -# aicc = (((p_e-p_s)/sigmai)**2).sum()+ 2*k*(k+1)/np.maximum(n-k+1,1) - - p_e = prb_e.eval_points(x_s) - p_s = prb_s.data - dp_s = np.sign(np.diff(p_s)) - k = (dp_s[:-1] != dp_s[1:]).sum() # numpeaks - - # sigmai = (pup-plo)+_EPS - # aicc = (((p_e-p_s)/sigmai)**2).sum()+ 2*k*(k+1)/np.maximum(n_e-k+1,1) - # + np.abs((p_e-pup).clip(min=0)-(p_e-plo).clip(max=0)).sum() - sigmai = _logit(pup) - _logit(plo) + _EPS - aicc = ((((_logit(p_e) - _logit(p_s)) / sigmai) ** 2).sum() + - 2 * k * (k + 1) / np.maximum(n_e - k + 1, 1) + - np.abs((p_e - pup).clip(min=0) - - (p_e - plo).clip(max=0)).sum()) - - prb_s.aicc = aicc - # prb_s.labels.title = '' - # prb_s.labels.title='perr=%1.3f,aicc=%1.3f, n=%d, hs=%1.3f' % - # (prb_s.prediction_error_avg,aicc,n,hs) - - return prb_s - - def prb_search_best(self, prb_e=None, hsvec=None, hsfun='hste', - alpha=0.05, color='r', label=''): - """Return best smoothed binomial probability. 
- - Parameters - ---------- - prb_e : PlotData object with empirical binomial probabilites - hsvec : arraylike (default np.linspace(hsmax*0.1,hsmax,55)) - vector smoothing parameters - hsfun : - method for calculating hsmax - - """ - if prb_e is None: - prb_e = self.prb_empirical( - hs_e=self.hs_e, alpha=alpha, color=color) - if hsvec is None: - hsmax = self._get_max_smoothing(hsfun)[0] # @UnusedVariable - hsmax = max(hsmax, self.hs_e) - hsvec = np.linspace(hsmax * 0.2, hsmax, 55) - - hs_best = hsvec[-1] + 0.1 - prb_best = self.prb_smoothed(prb_e, hs_best, alpha, color, label) - aicc = np.zeros(np.size(hsvec)) - for i, hi in enumerate(hsvec): - f = self.prb_smoothed(prb_e, hi, alpha, color, label) - aicc[i] = f.aicc - if f.aicc <= prb_best.aicc: - prb_best = f - hs_best = hi - prb_best.score = PlotData(aicc, hsvec) - prb_best.hs = hs_best - self._set_smoothing(hs_best) - return prb_best - - -class _Kernel(object): - - def __init__(self, r=1.0, stats=None): - self.r = r # radius of kernel - self.stats = stats - - def norm_factor(self, d=1, n=None): - return 1.0 - - def norm_kernel(self, x): - X = np.atleast_2d(x) - return self._kernel(X) / self.norm_factor(*X.shape) - - def kernel(self, x): - return self._kernel(np.atleast_2d(x)) - - def deriv4_6_8_10(self, t, numout=4): - raise Exception('Method not implemented for this kernel!') - - def effective_support(self): - """Return the effective support of kernel. - - The kernel must be symmetric and compactly supported on [-tau tau] - if the kernel has infinite support then the kernel must have the - effective support in [-tau tau], i.e., be negligible outside the range - - """ - return self._effective_support() - - def _effective_support(self): - return - self.r, self.r - __call__ = kernel - - -class _KernelMulti(_Kernel): - # p=0; %Sphere = rect for 1D - # p=1; %Multivariate Epanechnikov kernel. - # p=2; %Multivariate Bi-weight Kernel - # p=3; %Multi variate Tri-weight Kernel - # p=4; %Multi variate Four-weight Kernel - - def __init__(self, r=1.0, p=1, stats=None): - self.r = r - self.p = p - self.stats = stats - - def norm_factor(self, d=1, n=None): - r = self.r - p = self.p - c = 2 ** p * np.prod(np.r_[1:p + 1]) * sphere_volume(d, r) / np.prod( - np.r_[(d + 2):(2 * p + d + 1):2]) # normalizing constant - return c - - def _kernel(self, x): - r = self.r - p = self.p - x2 = x ** 2 - return ((1.0 - x2.sum(axis=0) / r ** 2).clip(min=0.0)) ** p - -mkernel_epanechnikov = _KernelMulti(p=1, stats=_stats_epan) -mkernel_biweight = _KernelMulti(p=2, stats=_stats_biwe) -mkernel_triweight = _KernelMulti(p=3, stats=_stats_triw) - - -class _KernelProduct(_KernelMulti): - # p=0; %rectangular - # p=1; %1D product Epanechnikov kernel. 
- # p=2; %1D product Bi-weight Kernel - # p=3; %1D product Tri-weight Kernel - # p=4; %1D product Four-weight Kernel - - def norm_factor(self, d=1, n=None): - r = self.r - p = self.p - c = (2 ** p * np.prod(np.r_[1:p + 1]) * sphere_volume(1, r) / - np.prod(np.r_[(1 + 2):(2 * p + 2):2])) - return c ** d - - def _kernel(self, x): - r = self.r # radius - pdf = (1 - (x / r) ** 2).clip(min=0.0) - return pdf.prod(axis=0) - -mkernel_p1epanechnikov = _KernelProduct(p=1, stats=_stats_epan) -mkernel_p1biweight = _KernelProduct(p=2, stats=_stats_biwe) -mkernel_p1triweight = _KernelProduct(p=3, stats=_stats_triw) - - -class _KernelRectangular(_Kernel): - - def _kernel(self, x): - return np.where(np.all(np.abs(x) <= self.r, axis=0), 1, 0.0) - - def norm_factor(self, d=1, n=None): - r = self.r - return (2 * r) ** d -mkernel_rectangular = _KernelRectangular(stats=_stats_rect) - - -class _KernelTriangular(_Kernel): - - def _kernel(self, x): - pdf = (1 - np.abs(x)).clip(min=0.0) - return pdf.prod(axis=0) -mkernel_triangular = _KernelTriangular(stats=_stats_tria) - - -class _KernelGaussian(_Kernel): - - def _kernel(self, x): - sigma = self.r / 4.0 - x2 = (x / sigma) ** 2 - return exp(-0.5 * x2.sum(axis=0)) - - def norm_factor(self, d=1, n=None): - sigma = self.r / 4.0 - return (2 * pi * sigma) ** (d / 2.0) - - def deriv4_6_8_10(self, t, numout=4): - """Returns 4th, 6th, 8th and 10th derivatives of the kernel - function.""" - phi0 = exp(-0.5 * t ** 2) / sqrt(2 * pi) - p4 = [1, 0, -6, 0, +3] - p4val = np.polyval(p4, t) * phi0 - if numout == 1: - return p4val - out = [p4val] - pn = p4 - for unusedix in range(numout - 1): - pnp1 = np.polyadd(-np.r_[pn, 0], np.polyder(pn)) - pnp2 = np.polyadd(-np.r_[pnp1, 0], np.polyder(pnp1)) - out.append(np.polyval(pnp2, t) * phi0) - pn = pnp2 - return out - -mkernel_gaussian = _KernelGaussian(r=4.0, stats=_stats_gaus) - -# def mkernel_gaussian(X): -# x2 = X ** 2 -# d = X.shape[0] -# return (2 * pi) ** (-d / 2) * exp(-0.5 * x2.sum(axis=0)) - - -class _KernelLaplace(_Kernel): - - def _kernel(self, x): - absX = np.abs(x) - return exp(-absX.sum(axis=0)) - - def norm_factor(self, d=1, n=None): - return 2 ** d -mkernel_laplace = _KernelLaplace(r=7.0, stats=_stats_lapl) - - -class _KernelLogistic(_Kernel): - - def _kernel(self, x): - s = exp(-x) - return np.prod(1.0 / (s + 1) ** 2, axis=0) -mkernel_logistic = _KernelLogistic(r=7.0, stats=_stats_logi) - -_MKERNEL_DICT = dict( - epan=mkernel_epanechnikov, - biwe=mkernel_biweight, - triw=mkernel_triweight, - p1ep=mkernel_p1epanechnikov, - p1bi=mkernel_p1biweight, - p1tr=mkernel_p1triweight, - rect=mkernel_rectangular, - tria=mkernel_triangular, - lapl=mkernel_laplace, - logi=mkernel_logistic, - gaus=mkernel_gaussian -) -_KERNEL_EXPONENT_DICT = dict( - re=0, sp=0, ep=1, bi=2, tr=3, fo=4, fi=5, si=6, se=7) - - -class Kernel(object): - - """Multivariate kernel. - - Parameters - ---------- - name : string - defining the kernel. Valid options are: - 'epanechnikov' - Epanechnikov kernel. - 'biweight' - Bi-weight kernel. - 'triweight' - Tri-weight kernel. - 'p1epanechnikov' - product of 1D Epanechnikov kernel. - 'p1biweight' - product of 1D Bi-weight kernel. - 'p1triweight' - product of 1D Tri-weight kernel. - 'triangular' - Triangular kernel. - 'gaussian' - Gaussian kernel - 'rectangular' - Rectangular kernel. - 'laplace' - Laplace kernel. - 'logistic' - Logistic kernel. - Note that only the first 4 letters of the kernel name is needed. - - Examples - -------- - N = 20 - data = np.random.rayleigh(1, size=(N,)) - >>> data = np.array([ - ... 
0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887, - ... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, - ... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, - ... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452]) - - >>> import wafo.kdetools as wk - >>> gauss = wk.Kernel('gaussian') - >>> gauss.stats() - (1, 0.28209479177387814, 0.21157109383040862) - >>> np.allclose(gauss.hscv(data), 0.21779575) - True - >>> np.allclose(gauss.hstt(data), 0.16341135) - True - >>> np.allclose(gauss.hste(data), 0.19179399) - True - >>> np.allclose(gauss.hldpi(data), 0.22502733) - True - >>> wk.Kernel('laplace').stats() - (2, 0.25, inf) - - >>> triweight = wk.Kernel('triweight') - >>> np.allclose(triweight.stats(), - ... (0.1111111111111111, 0.81585081585081587, np.inf)) - True - >>> np.allclose(triweight(np.linspace(-1,1,11)), - ... [ 0., 0.046656, 0.262144, 0.592704, 0.884736, 1., - ... 0.884736, 0.592704, 0.262144, 0.046656, 0.]) - True - >>> np.allclose(triweight.hns(data), 0.82, rtol=1e-2) - True - >>> np.allclose(triweight.hos(data), 0.88, rtol=1e-2) - True - >>> np.allclose(triweight.hste(data), 0.57, rtol=1e-2) - True - >>> np.allclose(triweight.hscv(data), 0.648, rtol=1e-2) - True - - See also - -------- - mkernel - - References - ---------- - B. W. Silverman (1986) - 'Density estimation for statistics and data analysis' - Chapman and Hall, pp. 43, 76 - - Wand, M. P. and Jones, M. C. (1995) - 'Density estimation for statistics and data analysis' - Chapman and Hall, pp 31, 103, 175 - - """ - - def __init__(self, name, fun='hste'): # 'hns'): - self.kernel = _MKERNEL_DICT[name[:4]] - # self.name = self.kernel.__name__.replace('mkernel_', '').title() - try: - self.get_smoothing = getattr(self, fun) - except: - self.get_smoothing = self.hste - - def _get_name(self): - return self.kernel.__class__.__name__.replace('_Kernel', '').title() - name = property(_get_name) - - def get_smoothing(self, *args, **kwds): - pass - - def stats(self): - """Return some 1D statistics of the kernel. - - Returns - ------- - mu2 : real scalar - 2'nd order moment, i.e.,int(x^2*kernel(x)) - R : real scalar - integral of squared kernel, i.e., int(kernel(x)^2) - Rdd : real scalar - integral of squared double derivative of kernel, - i.e., int( (kernel''(x))^2 ). - - Reference - --------- - Wand,M.P. and Jones, M.C. (1995) - 'Kernel smoothing' - Chapman and Hall, pp 176. - - """ - return self.kernel.stats - - def deriv4_6_8_10(self, t, numout=4): - return self.kernel.deriv4_6_8_10(t, numout) - - def effective_support(self): - return self.kernel.effective_support() - - def hns(self, data): - """Returns Normal Scale Estimate of Smoothing Parameter. - - Parameter - --------- - data : 2D array - shape d x n (d = # dimensions ) - - Returns - ------- - h : array-like - one dimensional optimal value for smoothing parameter - given the data and kernel. size D - - HNS only gives an optimal value with respect to mean integrated - square error, when the true underlying distribution - is Gaussian. This works reasonably well if the data resembles a - Gaussian distribution. However if the distribution is asymmetric, - multimodal or have long tails then HNS may return a to large - smoothing parameter, i.e., the KDE may be oversmoothed and mask - important features of the data. (=> large bias). - One way to remedy this is to reduce H by multiplying with a constant - factor, e.g., 0.85. Another is to try different values for H and make a - visual check by eye. 
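As a concrete point of reference (a sketch, not taken from the original file): for the Gaussian kernel the AMISE constant used in hns reduces to (4/(3*n))**(1/5), so the robust normal scale rule can be written as below; the 0.85 shrinkage mentioned above is simply an extra factor applied to the result.

    import numpy as np

    def hns_gauss(x):
        # Normal scale (Silverman-type) bandwidth for 1-D data and a Gaussian kernel
        x = np.asarray(x, dtype=float)
        n = x.size
        iqr = np.subtract(*np.percentile(x, [75, 25]))
        sigma = min(np.std(x, ddof=1), iqr / 1.349)
        return sigma * (4.0 / (3.0 * n)) ** 0.2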
- - Example: - data = rndnorm(0, 1,20,1) - h = hns(data,'epan') - - See also: - --------- - hste, hbcv, hboot, hos, hldpi, hlscv, hscv, hstt, kde - - Reference: - --------- - B. W. Silverman (1986) - 'Density estimation for statistics and data analysis' - Chapman and Hall, pp 43-48 - Wand,M.P. and Jones, M.C. (1995) - 'Kernel smoothing' - Chapman and Hall, pp 60--63 - - """ - - A = np.atleast_2d(data) - n = A.shape[1] - - # R= int(mkernel(x)^2), mu2= int(x^2*mkernel(x)) - mu2, R, unusedRdd = self.stats() - AMISEconstant = (8 * sqrt(pi) * R / (3 * mu2 ** 2 * n)) ** (1. / 5) - iqr = iqrange(A, axis=1) # interquartile range - stdA = np.std(A, axis=1, ddof=1) - # use of interquartile range guards against outliers. - # the use of interquartile range is better if - # the distribution is skew or have heavy tails - # This lessen the chance of oversmoothing. - return np.where(iqr > 0, - np.minimum(stdA, iqr / 1.349), stdA) * AMISEconstant - - def hos(self, data): - """Returns Oversmoothing Parameter. - - Parameter - --------- - data = data matrix, size N x D (D = # dimensions ) - - Returns - ------- - h : vector size 1 x D - one dimensional maximum smoothing value for smoothing parameter - given the data and kernel. - - The oversmoothing or maximal smoothing principle relies on the fact - that there is a simple upper bound for the AMISE-optimal bandwidth for - estimation of densities with a fixed value of a particular scale - measure. While HOS will give too large bandwidth for optimal estimation - of a general density it provides an excellent starting point for - subjective choice of bandwidth. A sensible strategy is to plot an - estimate with bandwidth HOS and then sucessively look at plots based on - convenient fractions of HOS to see what features are present in the - data for various amount of smoothing. The relation to HNS is given by: - - HOS = HNS/0.93 - - Example: - -------- - data = rndnorm(0, 1,20,1) - h = hos(data,'epan'); - - See also hste, hbcv, hboot, hldpi, hlscv, hscv, hstt, kde, kdefun - - Reference - --------- - B. W. Silverman (1986) - 'Density estimation for statistics and data analysis' - Chapman and Hall, pp 43-48 - - Wand,M.P. and Jones, M.C. (1986) - 'Kernel smoothing' - Chapman and Hall, pp 60--63 - - """ - return self.hns(data) / 0.93 - - def hmns(self, data): - """Returns Multivariate Normal Scale Estimate of Smoothing Parameter. - - CALL: h = hmns(data,kernel) - - h = M dimensional optimal value for smoothing parameter - given the data and kernel. size D x D - data = data matrix, size D x N (D = # dimensions ) - kernel = 'epanechnikov' - Epanechnikov kernel. - 'biweight' - Bi-weight kernel. - 'triweight' - Tri-weight kernel. - 'gaussian' - Gaussian kernel - - Note that only the first 4 letters of the kernel name is needed. - - HMNS only gives a optimal value with respect to mean integrated - square error, when the true underlying distribution is Multivariate - Gaussian. This works reasonably well if the data resembles a - Multivariate Gaussian distribution. However if the distribution is - asymmetric, multimodal or have long tails then HNS is maybe more - appropriate. - - Example: - data = rndnorm(0, 1,20,2) - h = hmns(data,'epan') - - See also - -------- - - hns, hste, hbcv, hboot, hos, hldpi, hlscv, hscv, hstt - - Reference - ---------- - B. W. Silverman (1986) - 'Density estimation for statistics and data analysis' - Chapman and Hall, pp 43-48, 87 - - Wand,M.P. and Jones, M.C. 
(1995) - 'Kernel smoothing' - Chapman and Hall, pp 60--63, 86--88 - - """ - # TODO: implement more kernels - - A = np.atleast_2d(data) - d, n = A.shape - - if d == 1: - return self.hns(data) - name = self.name[:4].lower() - if name == 'epan': # Epanechnikov kernel - a = (8.0 * (d + 4.0) * (2 * sqrt(pi)) ** d / - sphere_volume(d)) ** (1. / (4.0 + d)) - elif name == 'biwe': # Bi-weight kernel - a = 2.7779 - if d > 2: - raise ValueError('not implemented for d>2') - elif name == 'triw': # Triweight - a = 3.12 - if d > 2: - raise ValueError('not implemented for d>2') - elif name == 'gaus': # Gaussian kernel - a = (4.0 / (d + 2.0)) ** (1. / (d + 4.0)) - else: - raise ValueError('Unknown kernel.') - - covA = scipy.cov(A) - - return a * linalg.sqrtm(covA).real * n ** (-1. / (d + 4)) - - def hste(self, data, h0=None, inc=128, maxit=100, releps=0.01, abseps=0.0): - '''HSTE 2-Stage Solve the Equation estimate of smoothing parameter. - - CALL: hs = hste(data,kernel,h0) - - hs = one dimensional value for smoothing parameter - given the data and kernel. size 1 x D - data = data matrix, size N x D (D = # dimensions ) - kernel = 'gaussian' - Gaussian kernel (default) - ( currently the only supported kernel) - h0 = initial starting guess for hs (default h0=hns(A,kernel)) - - Example: - x = rndnorm(0,1,50,1); - hs = hste(x,'gauss'); - - See also hbcv, hboot, hos, hldpi, hlscv, hscv, hstt, kde, kdefun - - Reference - --------- - B. W. Silverman (1986) - 'Density estimation for statistics and data analysis' - Chapman and Hall, pp 57--61 - - Wand,M.P. and Jones, M.C. (1986) - 'Kernel smoothing' - Chapman and Hall, pp 74--75 - ''' - # TODO: NB: this routine can be made faster: - # TODO: replace the iteration in the end with a Newton Raphson scheme - - A = np.atleast_2d(data) - d, n = A.shape - - # R= int(mkernel(x)^2), mu2= int(x^2*mkernel(x)) - mu2, R, unusedRdd = self.stats() - - AMISEconstant = (8 * sqrt(pi) * R / (3 * mu2 ** 2 * n)) ** (1. / 5) - STEconstant = R / (mu2 ** (2) * n) - - sigmaA = self.hns(A) / AMISEconstant - if h0 is None: - h0 = sigmaA * AMISEconstant - - h = np.asarray(h0, dtype=float) - - nfft = inc * 2 - amin = A.min(axis=1) # Find the minimum value of A. - amax = A.max(axis=1) # Find the maximum value of A. - arange = amax - amin # Find the range of A. - - # xa holds the x 'axis' vector, defining a grid of x values where - # the k.d. function will be evaluated. - - ax1 = amin - arange / 8.0 - bx1 = amax + arange / 8.0 - - kernel2 = Kernel('gauss') - mu2, R, unusedRdd = kernel2.stats() - STEconstant2 = R / (mu2 ** (2) * n) - fft = np.fft.fft - ifft = np.fft.ifft - - for dim in range(d): - s = sigmaA[dim] - ax = ax1[dim] - bx = bx1[dim] - - xa = np.linspace(ax, bx, inc) - xn = np.linspace(0, bx - ax, inc) - - c = gridcount(A[dim], xa) - - # Step 1 - psi6NS = -15 / (16 * sqrt(pi) * s ** 7) - psi8NS = 105 / (32 * sqrt(pi) * s ** 9) - - # Step 2 - k40, k60 = kernel2.deriv4_6_8_10(0, numout=2) - g1 = (-2 * k40 / (mu2 * psi6NS * n)) ** (1.0 / 7) - g2 = (-2 * k60 / (mu2 * psi8NS * n)) ** (1.0 / 9) - - # Estimate psi6 given g2. - # kernel weights. - kw4, kw6 = kernel2.deriv4_6_8_10(xn / g2, numout=2) - # Apply fftshift to kw. - kw = np.r_[kw6, 0, kw6[-1:0:-1]] - z = np.real(ifft(fft(c, nfft) * fft(kw))) # convolution. - psi6 = np.sum(c * z[:inc]) / (n * (n - 1) * g2 ** 7) - - # Estimate psi4 given g1. - kw4 = kernel2.deriv4_6_8_10(xn / g1, numout=1) # kernel weights. - kw = np.r_[kw4, 0, kw4[-1:0:-1]] # Apply 'fftshift' to kw. - z = np.real(ifft(fft(c, nfft) * fft(kw))) # convolution. 
- psi4 = np.sum(c * z[:inc]) / (n * (n - 1) * g1 ** 5) - - h1 = h[dim] - h_old = 0 - count = 0 - - while ((abs(h_old - h1) > max(releps * h1, abseps)) and - (count < maxit)): - count += 1 - h_old = h1 - - # Step 3 - gamma = ((2 * k40 * mu2 * psi4 * h1 ** 5) / - (-psi6 * R)) ** (1.0 / 7) - - # Now estimate psi4 given gamma. - # kernel weights. - kw4 = kernel2.deriv4_6_8_10(xn / gamma, numout=1) - kw = np.r_[kw4, 0, kw4[-1:0:-1]] # Apply 'fftshift' to kw. - z = np.real(ifft(fft(c, nfft) * fft(kw))) # convolution. - - psi4Gamma = np.sum(c * z[:inc]) / (n * (n - 1) * gamma ** 5) - - # Step 4 - h1 = (STEconstant2 / psi4Gamma) ** (1.0 / 5) - - # Kernel other than Gaussian scale bandwidth - h1 = h1 * (STEconstant / STEconstant2) ** (1.0 / 5) - - if count >= maxit: - warnings.warn('The obtained value did not converge.') - - h[dim] = h1 - # end for dim loop - return h - - def hisj(self, data, inc=512, L=7): - ''' - HISJ Improved Sheather-Jones estimate of smoothing parameter. - - Unlike many other implementations, this one is immune to problems - caused by multimodal densities with widely separated modes. The - estimation does not deteriorate for multimodal densities, because - it do not assume a parametric model for the data. - - Parameters - ---------- - data - a vector of data from which the density estimate is constructed - inc - the number of mesh points used in the uniform discretization - - Returns - ------- - bandwidth - the optimal bandwidth - - Reference - --------- - Kernel density estimation via diffusion - Z. I. Botev, J. F. Grotowski, and D. P. Kroese (2010) - Annals of Statistics, Volume 38, Number 5, pages 2916-2957. - ''' - A = np.atleast_2d(data) - d, n = A.shape - - # R= int(mkernel(x)^2), mu2= int(x^2*mkernel(x)) - mu2, R, unusedRdd = self.stats() - STEconstant = R / (n * mu2 ** 2) - - amin = A.min(axis=1) # Find the minimum value of A. - amax = A.max(axis=1) # Find the maximum value of A. - arange = amax - amin # Find the range of A. - - # xa holds the x 'axis' vector, defining a grid of x values where - # the k.d. function will be evaluated. - - ax1 = amin - arange / 8.0 - bx1 = amax + arange / 8.0 - - kernel2 = Kernel('gauss') - mu2, R, unusedRdd = kernel2.stats() - STEconstant2 = R / (mu2 ** (2) * n) - - def fixed_point(t, N, I, a2): - ''' this implements the function t-zeta*gamma^[L](t)''' - - prod = np.prod - # L = 7 - logI = np.log(I) - f = 2 * pi ** (2 * L) * \ - (a2 * exp(L * logI - I * pi ** 2 * t)).sum() - for s in range(L - 1, 1, -1): - K0 = prod(np.r_[1:2 * s:2]) / sqrt(2 * pi) - const = (1 + (1. / 2) ** (s + 1. / 2)) / 3 - time = (2 * const * K0 / N / f) ** (2. / (3 + 2 * s)) - f = 2 * pi ** (2 * s) * \ - (a2 * exp(s * logI - I * pi ** 2 * time)).sum() - return t - (2 * N * sqrt(pi) * f) ** (-2. 
/ 5) - - h = np.empty(d) - for dim in range(d): - ax = ax1[dim] - bx = bx1[dim] - xa = np.linspace(ax, bx, inc) - R = bx - ax - - c = gridcount(A[dim], xa) - N = len(set(A[dim])) - # a = dct(c/c.sum(), norm=None) - a = dct(c / len(A[dim]), norm=None) - - # now compute the optimal bandwidth^2 using the referenced method - I = np.asfarray(np.arange(1, inc)) ** 2 - a2 = (a[1:] / 2) ** 2 - - def fun(t): - return fixed_point(t, N, I, a2) - x = np.linspace(0, 0.1, 150) - ai = x[0] - f0 = fun(ai) - for bi in x[1:]: - f1 = fun(bi) - if f1 * f0 <= 0: - # print('ai = %g, bi = %g' % (ai,bi)) - break - else: - ai = bi - # y = np.asarray([fun(j) for j in x]) - # plt.figure(1) - # plt.plot(x,y) - # plt.show() - - # use fzero to solve the equation t=zeta*gamma^[5](t) - try: - t_star = optimize.brentq(fun, a=ai, b=bi) - except: - t_star = 0.28 * N ** (-2. / 5) - warnings.warn('Failure in obtaining smoothing parameter') - - # smooth the discrete cosine transform of initial data using t_star - # a_t = a*exp(-np.arange(inc)**2*pi**2*t_star/2) - # now apply the inverse discrete cosine transform - # density = idct(a_t)/R; - - # take the rescaling of the data into account - bandwidth = sqrt(t_star) * R - - # Kernel other than Gaussian scale bandwidth - h[dim] = bandwidth * (STEconstant / STEconstant2) ** (1.0 / 5) - # end for dim loop - return h - - def hstt(self, data, h0=None, inc=128, maxit=100, releps=0.01, abseps=0.0): - '''HSTT Scott-Tapia-Thompson estimate of smoothing parameter. - - CALL: hs = hstt(data,kernel) - - hs = one dimensional value for smoothing parameter - given the data and kernel. size 1 x D - data = data matrix, size N x D (D = # dimensions ) - kernel = 'epanechnikov' - Epanechnikov kernel. (default) - 'biweight' - Bi-weight kernel. - 'triweight' - Tri-weight kernel. - 'triangular' - Triangular kernel. - 'gaussian' - Gaussian kernel - 'rectangular' - Rectangular kernel. - 'laplace' - Laplace kernel. - 'logistic' - Logistic kernel. - - HSTT returns Scott-Tapia-Thompson (STT) estimate of smoothing - parameter. This is a Solve-The-Equation rule (STE). - Simulation studies shows that the STT estimate of HS - is a good choice under a variety of models. A comparison with - likelihood cross-validation (LCV) indicates that LCV performs slightly - better for short tailed densities. - However, STT method in contrast to LCV is insensitive to outliers. - - Example - ------- - x = rndnorm(0,1,50,1); - hs = hstt(x,'gauss'); - - See also - -------- - hste, hbcv, hboot, hos, hldpi, hlscv, hscv, kde, kdebin - - Reference - --------- - B. W. Silverman (1986) - 'Density estimation for statistics and data analysis' - Chapman and Hall, pp 57--61 - ''' - A = np.atleast_2d(data) - d, n = A.shape - - # R= int(mkernel(x)^2), mu2= int(x^2*mkernel(x)) - mu2, R, unusedRdd = self.stats() - - AMISEconstant = (8 * sqrt(pi) * R / (3 * mu2 ** 2 * n)) ** (1. / 5) - STEconstant = R / (mu2 ** (2) * n) - - sigmaA = self.hns(A) / AMISEconstant - if h0 is None: - h0 = sigmaA * AMISEconstant - - h = np.asarray(h0, dtype=float) - - nfft = inc * 2 - amin = A.min(axis=1) # Find the minimum value of A. - amax = A.max(axis=1) # Find the maximum value of A. - arange = amax - amin # Find the range of A. - - # xa holds the x 'axis' vector, defining a grid of x values where - # the k.d. function will be evaluated. 
- - ax1 = amin - arange / 8.0 - bx1 = amax + arange / 8.0 - - fft = np.fft.fft - ifft = np.fft.ifft - for dim in range(d): - s = sigmaA[dim] - datan = A[dim] / s - ax = ax1[dim] / s - bx = bx1[dim] / s - - xa = np.linspace(ax, bx, inc) - xn = np.linspace(0, bx - ax, inc) - - c = gridcount(datan, xa) - - count = 1 - h_old = 0 - h1 = h[dim] / s - delta = (bx - ax) / (inc - 1) - while ((abs(h_old - h1) > max(releps * h1, abseps)) and - (count < maxit)): - count += 1 - h_old = h1 - - kw4 = self.kernel(xn / h1) / (n * h1 * self.norm_factor(d=1)) - kw = np.r_[kw4, 0, kw4[-1:0:-1]] # Apply 'fftshift' to kw. - f = np.real(ifft(fft(c, nfft) * fft(kw))) # convolution. - - # Estimate psi4=R(f'') using simple finite differences and - # quadrature. - ix = np.arange(1, inc - 1) - z = ((f[ix + 1] - 2 * f[ix] + f[ix - 1]) / delta ** 2) ** 2 - psi4 = delta * z.sum() - h1 = (STEconstant / psi4) ** (1. / 5) - - if count >= maxit: - warnings.warn('The obtained value did not converge.') - - h[dim] = h1 * s - # end % for dim loop - return h - - def hscv(self, data, hvec=None, inc=128, maxit=100, fulloutput=False): - ''' - HSCV Smoothed cross-validation estimate of smoothing parameter. - - CALL: [hs,hvec,score] = hscv(data,kernel,hvec) - - hs = smoothing parameter - hvec = vector defining possible values of hs - (default linspace(0.25*h0,h0,100), h0=0.62) - score = score vector - data = data vector - kernel = 'gaussian' - Gaussian kernel the only supported - - Note that only the first 4 letters of the kernel name is needed. - - Example: - data = rndnorm(0,1,20,1) - [hs hvec score] = hscv(data,'epan'); - plot(hvec,score) - See also hste, hbcv, hboot, hos, hldpi, hlscv, hstt, kde, kdefun - - Wand,M.P. and Jones, M.C. (1986) - 'Kernel smoothing' - Chapman and Hall, pp 75--79 - ''' - # TODO: Add support for other kernels than Gaussian - A = np.atleast_2d(data) - d, n = A.shape - - # R= int(mkernel(x)^2), mu2= int(x^2*mkernel(x)) - mu2, R, unusedRdd = self.stats() - - AMISEconstant = (8 * sqrt(pi) * R / (3 * mu2 ** 2 * n)) ** (1. / 5) - STEconstant = R / (mu2 ** (2) * n) - - sigmaA = self.hns(A) / AMISEconstant - if hvec is None: - H = AMISEconstant / 0.93 - hvec = np.linspace(0.25 * H, H, maxit) - hvec = np.asarray(hvec, dtype=float) - - steps = len(hvec) - score = np.zeros(steps) - - nfft = inc * 2 - amin = A.min(axis=1) # Find the minimum value of A. - amax = A.max(axis=1) # Find the maximum value of A. - arange = amax - amin # Find the range of A. - - # xa holds the x 'axis' vector, defining a grid of x values where - # the k.d. function will be evaluated. - - ax1 = amin - arange / 8.0 - bx1 = amax + arange / 8.0 - - kernel2 = Kernel('gauss') - mu2, R, unusedRdd = kernel2.stats() - STEconstant2 = R / (mu2 ** (2) * n) - fft = np.fft.fft - ifft = np.fft.ifft - - h = np.zeros(d) - hvec = hvec * (STEconstant2 / STEconstant) ** (1. / 5.) - - k40, k60, k80, k100 = kernel2.deriv4_6_8_10(0, numout=4) - psi8 = 105 / (32 * sqrt(pi)) - psi12 = 3465. / (512 * sqrt(pi)) - g1 = (-2. * k60 / (mu2 * psi8 * n)) ** (1. / 9.) - g2 = (-2. * k100 / (mu2 * psi12 * n)) ** (1. / 13.) 
- - for dim in range(d): - s = sigmaA[dim] - ax = ax1[dim] / s - bx = bx1[dim] / s - datan = A[dim] / s - - xa = np.linspace(ax, bx, inc) - xn = np.linspace(0, bx - ax, inc) - - c = gridcount(datan, xa) - - kw4, kw6 = kernel2.deriv4_6_8_10(xn / g1, numout=2) - kw = np.r_[kw6, 0, kw6[-1:0:-1]] - z = np.real(ifft(fft(c, nfft) * fft(kw))) - psi6 = np.sum(c * z[:inc]) / (n ** 2 * g1 ** 7) - - kw4, kw6, kw8, kw10 = kernel2.deriv4_6_8_10(xn / g2, numout=4) - kw = np.r_[kw10, 0, kw10[-1:0:-1]] - z = np.real(ifft(fft(c, nfft) * fft(kw))) - psi10 = np.sum(c * z[:inc]) / (n ** 2 * g2 ** 11) - - g3 = (-2. * k40 / (mu2 * psi6 * n)) ** (1. / 7.) - g4 = (-2. * k80 / (mu2 * psi10 * n)) ** (1. / 11.) - - kw4 = kernel2.deriv4_6_8_10(xn / g3, numout=1) - kw = np.r_[kw4, 0, kw4[-1:0:-1]] - z = np.real(ifft(fft(c, nfft) * fft(kw))) - psi4 = np.sum(c * z[:inc]) / (n ** 2 * g3 ** 5) - - kw4, kw6, kw8 = kernel2.deriv4_6_8_10(xn / g3, numout=3) - kw = np.r_[kw8, 0, kw8[-1:0:-1]] - z = np.real(ifft(fft(c, nfft) * fft(kw))) - psi8 = np.sum(c * z[:inc]) / (n ** 2 * g4 ** 9) - - const = (441. / (64 * pi)) ** (1. / 18.) * \ - (4 * pi) ** (-1. / 5.) * \ - psi4 ** (-2. / 5.) * psi8 ** (-1. / 9.) - - M = np.atleast_2d(datan) - - Y = (M - M.T).ravel() - - for i in range(steps): - g = const * n ** (-23. / 45) * hvec[i] ** (-2) - sig1 = sqrt(2 * hvec[i] ** 2 + 2 * g ** 2) - sig2 = sqrt(hvec[i] ** 2 + 2 * g ** 2) - sig3 = sqrt(2 * g ** 2) - term2 = np.sum(kernel2(Y / sig1) / sig1 - 2 * kernel2( - Y / sig2) / sig2 + kernel2(Y / sig3) / sig3) - - score[i] = 1. / (n * hvec[i] * 2. * sqrt(pi)) + term2 / n ** 2 - - idx = score.argmin() - # Kernel other than Gaussian scale bandwidth - h[dim] = hvec[idx] * (STEconstant / STEconstant2) ** (1 / 5) - if idx == 0: - warnings.warn( - 'Optimum is probably lower than hs=%g for dim=%d' % - (h[dim] * s, dim)) - elif idx == maxit - 1: - warnings.warn( - 'Optimum is probably higher than hs=%g for dim=%d' % - (h[dim] * s, dim)) - - hvec = hvec * (STEconstant / STEconstant2) ** (1 / 5) - if fulloutput: - return h * sigmaA, score, hvec, sigmaA - else: - return h * sigmaA - - def hldpi(self, data, L=2, inc=128): - '''HLDPI L-stage Direct Plug-In estimate of smoothing parameter. - - CALL: hs = hldpi(data,kernel,L) - - hs = one dimensional value for smoothing parameter - given the data and kernel. size 1 x D - data = data matrix, size N x D (D = # dimensions ) - kernel = 'epanechnikov' - Epanechnikov kernel. - 'biweight' - Bi-weight kernel. - 'triweight' - Tri-weight kernel. - 'triangluar' - Triangular kernel. - 'gaussian' - Gaussian kernel - 'rectangular' - Rectanguler kernel. - 'laplace' - Laplace kernel. - 'logistic' - Logistic kernel. - L = 0,1,2,3,... (default 2) - - Note that only the first 4 letters of the kernel name is needed. - - Example: - x = rndnorm(0,1,50,1); - hs = hldpi(x,'gauss',1); - - See also hste, hbcv, hboot, hos, hlscv, hscv, hstt, kde, kdefun - - Wand,M.P. and Jones, M.C. (1995) - 'Kernel smoothing' - Chapman and Hall, pp 67--74 - ''' - A = np.atleast_2d(data) - d, n = A.shape - - # R= int(mkernel(x)^2), mu2= int(x^2*mkernel(x)) - mu2, R, unusedRdd = self.stats() - - AMISEconstant = (8 * sqrt(pi) * R / (3 * n * mu2 ** 2)) ** (1. / 5) - STEconstant = R / (n * mu2 ** 2) - - sigmaA = self.hns(A) / AMISEconstant - - nfft = inc * 2 - amin = A.min(axis=1) # Find the minimum value of A. - amax = A.max(axis=1) # Find the maximum value of A. - arange = amax - amin # Find the range of A. - - # xa holds the x 'axis' vector, defining a grid of x values where - # the k.d. 
function will be evaluated. - - ax1 = amin - arange / 8.0 - bx1 = amax + arange / 8.0 - - kernel2 = Kernel('gauss') - mu2, unusedR, unusedRdd = kernel2.stats() - - fft = np.fft.fft - ifft = np.fft.ifft - - h = np.zeros(d) - for dim in range(d): - s = sigmaA[dim] - datan = A[dim] # / s - ax = ax1[dim] # / s - bx = bx1[dim] # / s - - xa = np.linspace(ax, bx, inc) - xn = np.linspace(0, bx - ax, inc) - - c = gridcount(datan, xa) - - r = 2 * L + 4 - rd2 = L + 2 - - # Eq. 3.7 in Wand and Jones (1995) - PSI_r = (-1) ** (rd2) * np.prod( - np.r_[rd2 + 1:r + 1]) / (sqrt(pi) * (2 * s) ** (r + 1)) - PSI = PSI_r - if L > 0: - # High order derivatives of the Gaussian kernel - Kd = kernel2.deriv4_6_8_10(0, numout=L) - - # L-stage iterations to estimate PSI_4 - for ix in range(L, 0, -1): - gi = (-2 * Kd[ix - 1] / - (mu2 * PSI * n)) ** (1. / (2 * ix + 5)) - - # Obtain the kernel weights. - KW0 = kernel2.deriv4_6_8_10(xn / gi, numout=ix) - if ix > 1: - KW0 = KW0[-1] - # Apply 'fftshift' to kw. - kw = np.r_[KW0, 0, KW0[inc - 1:0:-1]] - - # Perform the convolution. - z = np.real(ifft(fft(c, nfft) * fft(kw))) - - PSI = np.sum(c * z[:inc]) / (n ** 2 * gi ** (2 * ix + 3)) - # end - # end - h[dim] = (STEconstant / PSI) ** (1. / 5) - return h - - def norm_factor(self, d=1, n=None): - return self.kernel.norm_factor(d, n) - - def eval_points(self, points): - return self.kernel(np.atleast_2d(points)) - __call__ = eval_points - - -def mkernel(X, kernel): - """MKERNEL Multivariate Kernel Function. - - Paramaters - ---------- - X : array-like - matrix size d x n (d = # dimensions, n = # evaluation points) - kernel : string - defining kernel - 'epanechnikov' - Epanechnikov kernel. - 'biweight' - Bi-weight kernel. - 'triweight' - Tri-weight kernel. - 'p1epanechnikov' - product of 1D Epanechnikov kernel. - 'p1biweight' - product of 1D Bi-weight kernel. - 'p1triweight' - product of 1D Tri-weight kernel. - 'triangular' - Triangular kernel. - 'gaussian' - Gaussian kernel - 'rectangular' - Rectangular kernel. - 'laplace' - Laplace kernel. - 'logistic' - Logistic kernel. - Note that only the first 4 letters of the kernel name is needed. - - Returns - ------- - z : ndarray - kernel function values evaluated at X - - See also - -------- - kde, kdefun, kdebin - - References - ---------- - B. W. Silverman (1986) - 'Density estimation for statistics and data analysis' - Chapman and Hall, pp. 43, 76 - - Wand, M. P. and Jones, M. C. (1995) - 'Density estimation for statistics and data analysis' - Chapman and Hall, pp 31, 103, 175 - - """ - fun = _MKERNEL_DICT[kernel[:4]] - return fun(np.atleast_2d(X)) - - -def accumsum(accmap, a, size, dtype=None): - if dtype is None: - dtype = a.dtype - size = np.atleast_1d(size) - if len(size) > 1: - binx = accmap[:, 0] - biny = accmap[:, 1] - out = sparse.coo_matrix( - (a.ravel(), (binx, biny)), shape=size, dtype=dtype).tocsr() - else: - binx = accmap.ravel() - zero = np.zeros(len(binx)) - out = sparse.coo_matrix( - (a.ravel(), (binx, zero)), shape=(size, 1), dtype=dtype).tocsr() - return out - - -def accumsum2(accmap, a, size): - return np.bincount(accmap.ravel(), a.ravel(), np.array(size).max()) - - -def accum(accmap, a, func=None, size=None, fill_value=0, dtype=None): - """An accumulation function similar to Matlab's `accumarray` function. - - Parameters - ---------- - accmap : ndarray - This is the "accumulation map". It maps input (i.e. indices into - `a`) to their destination in the output array. The first `a.ndim` - dimensions of `accmap` must be the same as `a.shape`. 
That is, - `accmap.shape[:a.ndim]` must equal `a.shape`. For example, if `a` - has shape (15,4), then `accmap.shape[:2]` must equal (15,4). In this - case `accmap[i,j]` gives the index into the output array where - element (i,j) of `a` is to be accumulated. If the output is, say, - a 2D, then `accmap` must have shape (15,4,2). The value in the - last dimension give indices into the output array. If the output is - 1D, then the shape of `accmap` can be either (15,4) or (15,4,1) - a : ndarray - The input data to be accumulated. - func : callable or None - The accumulation function. The function will be passed a list - of values from `a` to be accumulated. - If None, numpy.sum is assumed. - size : ndarray or None - The size of the output array. If None, the size will be determined - from `accmap`. - fill_value : scalar - The default value for elements of the output array. - dtype : numpy data type, or None - The data type of the output array. If None, the data type of - `a` is used. - - Returns - ------- - out : ndarray - The accumulated results. - - The shape of `out` is `size` if `size` is given. Otherwise the - shape is determined by the (lexicographically) largest indices of - the output found in `accmap`. - - - Examples - -------- - >>> from numpy import array, prod - >>> a = array([[1,2,3],[4,-1,6],[-1,8,9]]) - >>> a - array([[ 1, 2, 3], - [ 4, -1, 6], - [-1, 8, 9]]) - >>> # Sum the diagonals. - >>> accmap = array([[0,1,2],[2,0,1],[1,2,0]]) - >>> s = accum(accmap, a) - >>> s - array([ 9, 7, 15]) - >>> # A 2D output, from sub-arrays with shapes and positions like this: - >>> # [ (2,2) (2,1)] - >>> # [ (1,2) (1,1)] - >>> accmap = array([ - ... [[0,0],[0,0],[0,1]], - ... [[0,0],[0,0],[0,1]], - ... [[1,0],[1,0],[1,1]]]) - >>> # Accumulate using a product. - >>> accum(accmap, a, func=prod, dtype=float) - array([[ -8., 18.], - [ -8., 9.]]) - >>> # Same accmap, but create an array of lists of values. - >>> accum(accmap, a, func=lambda x: x, dtype='O') - array([[[1, 2, 4, -1], [3, 6]], - [[-1, 8], [9]]], dtype=object) - - """ - - def create_array_of_python_lists(accmap, a, size): - vals = np.empty(size, dtype='O') - for s in product(*[range(k) for k in size]): - vals[s] = [] - - for s in product(*[range(k) for k in a.shape]): - indx = tuple(accmap[s]) - val = a[s] - vals[indx].append(val) - - return vals - - # Check for bad arguments and handle the defaults. - if accmap.shape[:a.ndim] != a.shape: - raise ValueError( - "The initial dimensions of accmap must be the same as a.shape") - if func is None: - func = np.sum - if dtype is None: - dtype = a.dtype - if accmap.shape == a.shape: - accmap = np.expand_dims(accmap, -1) - adims = tuple(range(a.ndim)) - if size is None: - size = 1 + np.squeeze(np.apply_over_axes(np.max, accmap, axes=adims)) - size = np.atleast_1d(size) - - # Create an array of python lists of values. - vals = create_array_of_python_lists(accmap, a, size) - - # Create the output array. - out = np.empty(size, dtype=dtype) - for s in product(*[range(k) for k in size]): - if vals[s] == []: - out[s] = fill_value - else: - out[s] = func(vals[s]) - return out - - -def qlevels(pdf, p=(10, 30, 50, 70, 90, 95, 99, 99.9), x1=None, x2=None): - """QLEVELS Calculates quantile levels which encloses P% of PDF. - - CALL: [ql PL] = qlevels(pdf,PL,x1,x2); - - ql = the discrete quantile levels. 
- pdf = joint point density function matrix or vector - PL = percent level (default [10:20:90 95 99 99.9]) - x1,x2 = vectors of the spacing of the variables - (Default unit spacing) - - QLEVELS numerically integrates PDF by decreasing height and find the - quantile levels which encloses P% of the distribution. If X1 and - (or) X2 is unspecified it is assumed that dX1 and dX2 is constant. - NB! QLEVELS normalizes the integral of PDF to N/(N+0.001) before - calculating QL in order to reflect the sampling of PDF is finite. - Currently only able to handle 1D and 2D PDF's if dXi is not constant - (i=1,2). - - Example - ------- - >>> import wafo.stats as ws - >>> x = np.linspace(-8,8,2001); - >>> PL = np.r_[10:90:20, 90, 95, 99, 99.9] - >>> qlevels(ws.norm.pdf(x),p=PL, x1=x); - array([ 0.39591707, 0.37058719, 0.31830968, 0.23402133, 0.10362052, - 0.05862129, 0.01449505, 0.00178806]) - - # compared with the exact values - >>> ws.norm.pdf(ws.norm.ppf((100-PL)/200)) - array([ 0.39580488, 0.370399 , 0.31777657, 0.23315878, 0.10313564, - 0.05844507, 0.01445974, 0.00177719]) - - See also - -------- - qlevels2, tranproc - - """ - - norm = 1 # normalize cdf to unity - pdf = np.atleast_1d(pdf) - if any(pdf.ravel() < 0): - raise ValueError( - 'This is not a pdf since one or more values of pdf is negative') - - fsiz = pdf.shape - fsizmin = min(fsiz) - if fsizmin == 0: - return [] - - N = np.prod(fsiz) - d = len(fsiz) - if x1 is None or ((x2 is None) and d > 2): - fdfi = pdf.ravel() - else: - if d == 1: # pdf in one dimension - dx22 = np.ones(1) - else: # % pdf in two dimensions - dx2 = np.diff(x2.ravel()) * 0.5 - dx22 = np.r_[0, dx2] + np.r_[dx2, 0] - - dx1 = np.diff(x1.ravel()) * 0.5 - dx11 = np.r_[0, dx1] + np.r_[dx1, 0] - dx1x2 = dx22[:, None] * dx11 - fdfi = (pdf * dx1x2).ravel() - - p = np.atleast_1d(p) - - if np.any((p < 0) | (100 < p)): - raise ValueError('PL must satisfy 0 <= PL <= 100') - - p2 = p / 100.0 - ind = np.argsort(pdf.ravel()) # sort by height of pdf - ind = ind[::-1] - fi = pdf.flat[ind] - - # integration in the order of decreasing height of pdf - Fi = np.cumsum(fdfi[ind]) - - if norm: # %normalize Fi to make sure int pdf dx1 dx2 approx 1 - Fi = Fi / Fi[-1] * N / (N + 1.5e-8) - - maxFi = np.max(Fi) - if maxFi > 1: - warnings.warn('this is not a pdf since cdf>1! normalizing') - - Fi = Fi / Fi[-1] * N / (N + 1.5e-8) - - elif maxFi < .95: - msg = '''The given pdf is too sparsely sampled since cdf<.95. - Thus QL is questionable''' - warnings.warn(msg) - - # make sure Fi is strictly increasing by not considering duplicate values - ind, = np.where(np.diff(np.r_[Fi, 1]) > 0) - # calculating the inverse of Fi to find the index - ui = tranproc(Fi[ind], fi[ind], p2) - # to the desired quantile level - # ui=smooth(Fi(ind),fi(ind),1,p2(:),1) % alternative - # res=ui-ui2 - - if np.any(ui >= max(pdf.ravel())): - warnings.warn('The lowest percent level is too close to 0%') - - if np.any(ui <= min(pdf.ravel())): - msg = '''The given pdf is too sparsely sampled or - the highest percent level is too close to 100%''' - warnings.warn(msg) - ui[ui < 0] = 0.0 - - return ui - - -def qlevels2(data, p=(10, 30, 50, 70, 90, 95, 99, 99.9), method=1): - """QLEVELS2 Calculates quantile levels which encloses P% of data. 
- - CALL: [ql PL] = qlevels2(data,PL,method); - - ql = the discrete quantile levels, size D X Np - Parameters - ---------- - data : data matrix, size D x N (D = # of dimensions) - p : percent level vector, length Np (default [10:20:90 95 99 99.9]) - method : integer - 1 Interpolation so that F(X_(k)) == (k-0.5)/n. (default) - 2 Interpolation so that F(X_(k)) == k/(n+1). - 3 Based on the empirical distribution. - - Returns - ------- - - QLEVELS2 sort the columns of data in ascending order and find the - quantile levels for each column which encloses P% of the data. - - Examples : Finding quantile levels enclosing P% of data: - -------- - >>> import wafo.stats as ws - >>> PL = np.r_[10:90:20, 90, 95, 99, 99.9] - >>> xs = ws.norm.rvs(size=2500000) - >>> np.allclose(qlevels2(ws.norm.pdf(xs), p=PL), - ... [0.3958, 0.3704, 0.3179, 0.2331, 0.1031, 0.05841, 0.01451, 0.001751], - ... rtol=1e-1) - True - - # compared with the exact values - >>> ws.norm.pdf(ws.norm.ppf((100-PL)/200)) - array([ 0.39580488, 0.370399 , 0.31777657, 0.23315878, 0.10313564, - 0.05844507, 0.01445974, 0.00177719]) - - # Finding the median of xs: - >>> '%2.2f' % np.abs(qlevels2(xs,50)[0]) - '0.00' - - See also - -------- - qlevels - - """ - q = 100 - np.atleast_1d(p) - return percentile(data, q, axis=-1, method=method) - - -_PKDICT = {1: lambda k, w, n: (k - w) / (n - 1), - 2: lambda k, w, n: (k - w / 2) / n, - 3: lambda k, w, n: k / n, - 4: lambda k, w, n: k / (n + 1), - 5: lambda k, w, n: (k - w / 3) / (n + 1 / 3), - 6: lambda k, w, n: (k - w * 3 / 8) / (n + 1 / 4)} - - -def _compute_qth_weighted_percentile(a, q, axis, out, method, weights, - overwrite_input): - # normalise weight vector such that sum of the weight vector equals to n - q = np.atleast_1d(q) / 100.0 - if (q < 0).any() or (q > 1).any(): - raise ValueError("percentile must be in the range [0,100]") - - shape0 = a.shape - if axis is None: - sorted_ = a.ravel() - else: - taxes = [i for i in range(a.ndim)] - taxes[-1], taxes[axis] = taxes[axis], taxes[-1] - sorted_ = np.transpose(a, taxes).reshape(-1, shape0[axis]) - - ind = sorted_.argsort(axis=-1) - if overwrite_input: - sorted_.sort(axis=-1) - else: - sorted_ = np.sort(sorted_, axis=-1) - - w = np.atleast_1d(weights) - n = len(w) - w = w * n / w.sum() - - # Work on each column separately because of weight vector - m = sorted_.shape[0] - nq = len(q) - y = np.zeros((m, nq)) - pk_fun = _PKDICT.get(method, 1) - for i in range(m): - sortedW = w[ind[i]] # rearrange the weight according to ind - k = sortedW.cumsum() # cumulative weight - # different algorithm to compute percentile - pk = pk_fun(k, sortedW, n) - # Interpolation between pk and sorted_ for given value of q - y[i] = np.interp(q, pk, sorted_[i]) - if axis is None: - return np.squeeze(y) - else: - shape1 = list(shape0) - shape1[axis], shape1[-1] = shape1[-1], nq - return np.squeeze(np.transpose(y.reshape(shape1), taxes)) - -# method=1: p(k) = k/(n-1) -# method=2: p(k) = (k+0.5)/n. -# method=3: p(k) = (k+1)/n -# method=4: p(k) = (k+1)/(n+1) -# method=5: p(k) = (k+2/3)/(n+1/3) -# method=6: p(k) = (k+5/8)/(n+1/4) - -_KDICT = {1: lambda p, n: p * (n - 1), - 2: lambda p, n: p * n - 0.5, - 3: lambda p, n: p * n - 1, - 4: lambda p, n: p * (n + 1) - 1, - 5: lambda p, n: p * (n + 1. / 3) - 2. / 3, - 6: lambda p, n: p * (n + 1. / 4) - 5. 
/ 8} - - -def _compute_qth_percentile(sorted_, q, axis, out, method): - if not np.isscalar(q): - p = [_compute_qth_percentile(sorted_, qi, axis, None, method) - for qi in q] - if out is not None: - out.flat = p - return p - - q = q / 100.0 - if (q < 0) or (q > 1): - raise ValueError("percentile must be in the range [0,100]") - - indexer = [slice(None)] * sorted_.ndim - Nx = sorted_.shape[axis] - k_fun = _KDICT.get(method, 1) - index = np.clip(k_fun(q, Nx), 0, Nx - 1) - i = int(index) - if i == index: - indexer[axis] = slice(i, i + 1) - weights1 = np.array(1) - sumval = 1.0 - else: - indexer[axis] = slice(i, i + 2) - j = i + 1 - weights1 = np.array([(j - index), (index - i)], float) - wshape = [1] * sorted_.ndim - wshape[axis] = 2 - weights1.shape = wshape - sumval = weights1.sum() - - # Use add.reduce in both cases to coerce data type as well as - # check and use out array. - return np.add.reduce(sorted_[indexer] * weights1, - axis=axis, out=out) / sumval - - -def percentile(a, q, axis=None, out=None, overwrite_input=False, method=1, - weights=None): - """Compute the qth percentile of the data along the specified axis. - - Returns the qth percentile of the array elements. - - Parameters - ---------- - a : array_like - Input array or object that can be converted to an array. - q : float in range of [0,100] (or sequence of floats) - percentile to compute which must be between 0 and 100 inclusive - axis : {None, int}, optional - Axis along which the percentiles are computed. The default (axis=None) - is to compute the median along a flattened version of the array. - out : ndarray, optional - Alternative output array in which to place the result. It must - have the same shape and buffer length as the expected output, - but the type (of the output) will be cast if necessary. - overwrite_input : {False, True}, optional - If True, then allow use of memory of input array (a) for - calculations. The input array will be modified by the call to - median. This will save memory when you do not need to preserve - the contents of the input array. Treat the input as undefined, - but it will probably be fully or partially sorted. Default is - False. Note that, if `overwrite_input` is True and the input - is not already an ndarray, an error will be raised. - method : scalar integer - defining the interpolation method. Valid options are - 1 : p[k] = k/(n-1). In this case, p[k] = mode[F(x[k])]. - This is used by S. (default) - 2 : p[k] = (k+0.5)/n. That is a piecewise linear function where - the knots are the values midway through the steps of the - empirical cdf. This is popular amongst hydrologists. - Matlab also uses this formula. - 3 : p[k] = (k+1)/n. That is, linear interpolation of the empirical cdf. - 4 : p[k] = (k+1)/(n+1). Thus p[k] = E[F(x[k])]. - This is used by Minitab and by SPSS. - 5 : p[k] = (k+2/3)/(n+1/3). Then p[k] =~ median[F(x[k])]. - The resulting quantile estimates are approximately - median-unbiased regardless of the distribution of x. - 6 : p[k] = (k+5/8)/(n+1/4). The resulting quantile estimates are - approximately unbiased for the expected order statistics - if x is normally distributed. - - Returns - ------- - pcntile : ndarray - A new array holding the result (unless `out` is specified, in - which case that array is returned instead). If the input contains - integers, or floats of smaller precision than 64, then the output - data-type is float64. Otherwise, the output data-type is the same - as that of the input. 
- - See Also - -------- - mean, median - - Notes - ----- - Given a vector V of length N, the qth percentile of V is the qth ranked - value in a sorted copy of V. A weighted average of the two nearest - neighbors is used if the normalized ranking does not match q exactly. - The same as the median if q is 0.5; the same as the min if q is 0; - and the same as the max if q is 1 - - Examples - -------- - >>> import wafo.kdetools as wk - >>> a = np.array([[10, 7, 4], [3, 2, 1]]) - >>> a - array([[10, 7, 4], - [ 3, 2, 1]]) - >>> wk.percentile(a, 50) - 3.5 - >>> wk.percentile(a, 50, axis=0) - array([ 6.5, 4.5, 2.5]) - >>> wk.percentile(a, 50, axis=0, weights=np.ones(2)) - array([ 6.5, 4.5, 2.5]) - >>> wk.percentile(a, 50, axis=1) - array([ 7., 2.]) - >>> wk.percentile(a, 50, axis=1, weights=np.ones(3)) - array([ 7., 2.]) - >>> m = wk.percentile(a, 50, axis=0) - >>> out = np.zeros_like(m) - >>> wk.percentile(a, 50, axis=0, out=m) - array([ 6.5, 4.5, 2.5]) - >>> m - array([ 6.5, 4.5, 2.5]) - >>> b = a.copy() - >>> wk.percentile(b, 50, axis=1, overwrite_input=True) - array([ 7., 2.]) - >>> assert not np.all(a==b) - >>> b = a.copy() - >>> wk.percentile(b, 50, axis=None, overwrite_input=True) - 3.5 - >>> np.all(a==b) - True - - """ - a = np.asarray(a) - try: - if q == 0: - return a.min(axis=axis, out=out) - elif q == 100: - return a.max(axis=axis, out=out) - except: - pass - if weights is not None: - return _compute_qth_weighted_percentile(a, q, axis, out, method, - weights, overwrite_input) - elif overwrite_input: - if axis is None: - sorted_ = np.sort(a, axis=axis) - else: - a.sort(axis=axis) - sorted_ = a - else: - sorted_ = np.sort(a, axis=axis) - if axis is None: - axis = 0 - - return _compute_qth_percentile(sorted_, q, axis, out, method) - - -def iqrange(data, axis=None): - """Returns the Inter Quartile Range of data. - - Parameters - ---------- - data : array-like - Input array or object that can be converted to an array. - axis : {None, int}, optional - Axis along which the percentiles are computed. The default (axis=None) - is to compute the median along a flattened version of the array. - - Returns - ------- - r : array-like - abs(np.percentile(data, 75, axis)-np.percentile(data, 25, axis)) - - Notes - ----- - IQRANGE is a robust measure of spread. The use of interquartile range - guards against outliers if the distribution have heavy tails. - - Example - ------- - >>> a = np.arange(101) - >>> iqrange(a) - 50.0 - - See also - -------- - np.std - - """ - return np.abs(np.percentile(data, 75, axis=axis) - - np.percentile(data, 25, axis=axis)) - - -def bitget(int_type, offset): - """Returns the value of the bit at the offset position in int_type. - - Example - ------- - >>> bitget(5, np.r_[0:4]) - array([1, 0, 1, 0]) - - """ - return np.bitwise_and(int_type, 1 << offset) >> offset - - -def gridcount(data, X, y=1): - ''' - Returns D-dimensional histogram using linear binning. - - Parameters - ---------- - data = column vectors with D-dimensional data, shape D x Nd - X = row vectors defining discretization, shape D x N - Must include the range of the data. - - Returns - ------- - c = gridcount, shape N x N x ... x N - - GRIDCOUNT obtains the grid counts using linear binning. - There are 2 strategies: simple- or linear- binning. - Suppose that an observation occurs at x and that the nearest point - below and above is y and z, respectively. Then simple binning strategy - assigns a unit weight to either y or z, whichever is closer. 
Linear - binning, on the other hand, assigns the grid point at y with the weight - of (z-x)/(z-y) and the gridpoint at z a weight of (y-x)/(z-y). - - In terms of approximation error of using gridcounts as pdf-estimate, - linear binning is significantly more accurate than simple binning. - - NOTE: The interval [min(X);max(X)] must include the range of the data. - The order of C is permuted in the same order as - meshgrid for D==2 or D==3. - - Example - ------- - >>> import numpy as np - >>> import wafo.kdetools as wk - >>> import pylab as plb - >>> N = 20 - >>> data = np.random.rayleigh(1,N) - >>> data = np.array( - ... [ 1.07855907, 1.51199717, 1.54382893, 1.54774808, 1.51913566, - ... 1.11386486, 1.49146216, 1.51127214, 2.61287913, 0.94793051, - ... 2.08532731, 1.35510641, 0.56759888, 1.55766981, 0.77883602, - ... 0.9135759 , 0.81177855, 1.02111483, 1.76334202, 0.07571454]) - >>> x = np.linspace(0,max(data)+1,50) - >>> dx = x[1]-x[0] - - >>> c = wk.gridcount(data, x) - >>> np.allclose(c[:5], [ 0., 0.9731147, 0.0268853, 0., 0.]) - True - - >>> pdf = c/dx/N - >>> np.allclose(np.trapz(pdf, x), 1) - True - - h = plb.plot(x,c,'.') # 1D histogram - h1 = plb.plot(x, pdf) # 1D probability density plot - - See also - -------- - bincount, accum, kdebin - - Reference - ---------- - Wand,M.P. and Jones, M.C. (1995) - 'Kernel smoothing' - Chapman and Hall, pp 182-192 - ''' - dat = np.atleast_2d(data) - x = np.atleast_2d(X) - y = np.atleast_1d(y).ravel() - d = dat.shape[0] - d1, inc = x.shape - - if d != d1: - raise ValueError('Dimension 0 of data and X do not match.') - - dx = np.diff(x[:, :2], axis=1) - xlo = x[:, 0] - xup = x[:, -1] - - datlo = dat.min(axis=1) - datup = dat.max(axis=1) - if ((datlo < xlo) | (xup < datup)).any(): - raise ValueError('X does not include whole range of the data!') - - csiz = np.repeat(inc, d) - use_sparse = False - if use_sparse: - acfun = accumsum # faster than accum - else: - acfun = accumsum2 # accum - - binx = np.asarray(np.floor((dat - xlo[:, newaxis]) / dx), dtype=int) - w = dx.prod() - abs = np.abs # @ReservedAssignment - if d == 1: - x.shape = (-1,) - c = np.asarray((acfun(binx, (x[binx + 1] - dat) * y, size=(inc, )) + - acfun(binx + 1, (dat - x[binx]) * y, size=(inc, ))) / - w).ravel() - else: # d>2 - - Nc = csiz.prod() - c = np.zeros((Nc,)) - - fact2 = np.asarray(np.reshape(inc * np.arange(d), (d, -1)), dtype=int) - fact1 = np.asarray( - np.reshape(csiz.cumprod() / inc, (d, -1)), dtype=int) - # fact1 = fact1(ones(n,1),:); - bt0 = [0, 0] - X1 = X.ravel() - for ir in range(2 ** (d - 1)): - bt0[0] = np.reshape(bitget(ir, np.arange(d)), (d, -1)) - bt0[1] = 1 - bt0[0] - for ix in range(2): - one = np.mod(ix, 2) - two = np.mod(ix + 1, 2) - # Convert to linear index - # linear index to c - b1 = np.sum((binx + bt0[one]) * fact1, axis=0) - bt2 = bt0[two] + fact2 - b2 = binx + bt2 # linear index to X - c += acfun( - b1, abs(np.prod(X1[b2] - dat, axis=0)) * y, size=(Nc,)) - - c = np.reshape(c / w, csiz, order='F') - - T = [i for i in range(d)] - T[1], T[0] = T[0], T[1] - # make sure c is stored in the same way as meshgrid - c = c.transpose(*T) - return c - - -def kde_demo1(): - """KDEDEMO1 Demonstrate the smoothing parameter impact on KDE. - - KDEDEMO1 shows the true density (dotted) compared to KDE based on 7 - observations (solid) and their individual kernels (dashed) for 3 - different values of the smoothing parameter, hs. 
- - """ - - import scipy.stats as st - x = np.linspace(-4, 4, 101) - x0 = x / 2.0 - data = np.random.normal(loc=0, scale=1.0, size=7) - kernel = Kernel('gauss') - hs = kernel.hns(data) - hVec = [hs / 2, hs, 2 * hs] - - for ix, h in enumerate(hVec): - plt.figure(ix) - kde = KDE(data, hs=h, kernel=kernel) - f2 = kde(x, output='plot', title='h_s = %2.2f' % h, ylab='Density') - f2.plot('k-') - - plt.plot(x, st.norm.pdf(x, 0, 1), 'k:') - n = len(data) - plt.plot(data, np.zeros(data.shape), 'bx') - y = kernel(x0) / (n * h * kernel.norm_factor(d=1, n=n)) - for i in range(n): - plt.plot(data[i] + x0 * h, y, 'b--') - plt.plot([data[i], data[i]], [0, np.max(y)], 'b') - - plt.axis([x.min(), x.max(), 0, 0.5]) - - -def kde_demo2(): - '''Demonstrate the difference between transformation- and ordinary-KDE. - - KDEDEMO2 shows that the transformation KDE is a better estimate for - Rayleigh distributed data around 0 than the ordinary KDE. - ''' - import scipy.stats as st - data = st.rayleigh.rvs(scale=1, size=300) - - x = np.linspace(1.5e-2, 5, 55) - - kde = KDE(data) - f = kde(output='plot', title='Ordinary KDE (hs=%g)' % kde.hs) - plt.figure(0) - f.plot() - - plt.plot(x, st.rayleigh.pdf(x, scale=1), ':') - - # plotnorm((data).^(L2)) % gives a straight line => L2 = 0.5 reasonable - - tkde = TKDE(data, L2=0.5) - ft = tkde(x, output='plot', title='Transformation KDE (hs=%g)' % - tkde.tkde.hs) - plt.figure(1) - ft.plot() - - plt.plot(x, st.rayleigh.pdf(x, scale=1), ':') - - plt.figure(0) - - -def kde_demo3(): - '''Demonstrate the difference between transformation and ordinary-KDE in 2D - - KDEDEMO3 shows that the transformation KDE is a better estimate for - Rayleigh distributed data around 0 than the ordinary KDE. - ''' - import scipy.stats as st - data = st.rayleigh.rvs(scale=1, size=(2, 300)) - - # x = np.linspace(1.5e-3, 5, 55) - - kde = KDE(data) - f = kde(output='plot', title='Ordinary KDE', plotflag=1) - plt.figure(0) - f.plot() - - plt.plot(data[0], data[1], '.') - - # plotnorm((data).^(L2)) % gives a straight line => L2 = 0.5 reasonable - - tkde = TKDE(data, L2=0.5) - ft = tkde.eval_grid_fast( - output='plot', title='Transformation KDE', plotflag=1) - - plt.figure(1) - ft.plot() - - plt.plot(data[0], data[1], '.') - - plt.figure(0) - - -def kde_demo4(N=50): - '''Demonstrate that the improved Sheather-Jones plug-in (hisj) is superior - for 1D multimodal distributions - - KDEDEMO4 shows that the improved Sheather-Jones plug-in smoothing is a - better compared to normal reference rules (in this case the hns) - ''' - import scipy.stats as st - - data = np.hstack((st.norm.rvs(loc=5, scale=1, size=(N,)), - st.norm.rvs(loc=-5, scale=1, size=(N,)))) - - # x = np.linspace(1.5e-3, 5, 55) - - kde = KDE(data, kernel=Kernel('gauss', 'hns')) - f = kde(output='plot', title='Ordinary KDE', plotflag=1) - - kde1 = KDE(data, kernel=Kernel('gauss', 'hisj')) - f1 = kde1(output='plot', label='Ordinary KDE', plotflag=1) - - plt.figure(0) - f.plot('r', label='hns=%g' % kde.hs) - # plt.figure(2) - f1.plot('b', label='hisj=%g' % kde1.hs) - x = np.linspace(-4, 4) - for loc in [-5, 5]: - plt.plot(x + loc, st.norm.pdf(x, 0, scale=1) / 2, 'k:', - label='True density') - plt.legend() - - -def kde_demo5(N=500): - '''Demonstrate that the improved Sheather-Jones plug-in (hisj) is superior - for 2D multimodal distributions - - KDEDEMO5 shows that the improved Sheather-Jones plug-in smoothing is better - compared to normal reference rules (in this case the hns) - ''' - import scipy.stats as st - - data = np.hstack((st.norm.rvs(loc=5, 
scale=1, size=(2, N,)), - st.norm.rvs(loc=-5, scale=1, size=(2, N,)))) - kde = KDE(data, kernel=Kernel('gauss', 'hns')) - f = kde(output='plot', title='Ordinary KDE (hns=%g %g)' % - tuple(kde.hs.tolist()), plotflag=1) - - kde1 = KDE(data, kernel=Kernel('gauss', 'hisj')) - f1 = kde1(output='plot', title='Ordinary KDE (hisj=%g %g)' % - tuple(kde1.hs.tolist()), plotflag=1) - - plt.figure(0) - plt.clf() - f.plot() - plt.plot(data[0], data[1], '.') - plt.figure(1) - plt.clf() - f1.plot() - plt.plot(data[0], data[1], '.') - - -def kreg_demo1(hs=None, fast=False, fun='hisj'): - """""" - N = 100 - # ei = np.random.normal(loc=0, scale=0.075, size=(N,)) - ei = np.array([ - -0.08508516, 0.10462496, 0.07694448, -0.03080661, 0.05777525, - 0.06096313, -0.16572389, 0.01838912, -0.06251845, -0.09186784, - -0.04304887, -0.13365788, -0.0185279, -0.07289167, 0.02319097, - 0.06887854, -0.08938374, -0.15181813, 0.03307712, 0.08523183, - -0.0378058, -0.06312874, 0.01485772, 0.06307944, -0.0632959, - 0.18963205, 0.0369126, -0.01485447, 0.04037722, 0.0085057, - -0.06912903, 0.02073998, 0.1174351, 0.17599277, -0.06842139, - 0.12587608, 0.07698113, -0.0032394, -0.12045792, -0.03132877, - 0.05047314, 0.02013453, 0.04080741, 0.00158392, 0.10237899, - -0.09069682, 0.09242174, -0.15445323, 0.09190278, 0.07138498, - 0.03002497, 0.02495252, 0.01286942, 0.06449978, 0.03031802, - 0.11754861, -0.02322272, 0.00455867, -0.02132251, 0.09119446, - -0.03210086, -0.06509545, 0.07306443, 0.04330647, 0.078111, - -0.04146907, 0.05705476, 0.02492201, -0.03200572, -0.02859788, - -0.05893749, 0.00089538, 0.0432551, 0.04001474, 0.04888828, - -0.17708392, 0.16478644, 0.1171006, 0.11664846, 0.01410477, - -0.12458953, -0.11692081, 0.0413047, -0.09292439, -0.07042327, - 0.14119701, -0.05114335, 0.04994696, -0.09520663, 0.04829406, - -0.01603065, -0.1933216, 0.19352763, 0.11819496, 0.04567619, - -0.08348306, 0.00812816, -0.00908206, 0.14528945, 0.02901065]) - x = np.linspace(0, 1, N) - - y0 = 2 * np.exp(-x ** 2 / (2 * 0.3 ** 2)) + \ - 3 * np.exp(-(x - 1) ** 2 / (2 * 0.7 ** 2)) - y = y0 + ei - kernel = Kernel('gauss', fun=fun) - hopt = kernel.hisj(x) - kreg = KRegression( - x, y, p=0, hs=hs, kernel=kernel, xmin=-2 * hopt, xmax=1 + 2 * hopt) - if fast: - kreg.__call__ = kreg.eval_grid_fast - - f = kreg(output='plot', title='Kernel regression', plotflag=1) - plt.figure(0) - f.plot(label='p=0') - - kreg.p = 1 - f1 = kreg(output='plot', title='Kernel regression', plotflag=1) - f1.plot(label='p=1') - # print(f1.data) - plt.plot(x, y, '.', label='data') - plt.plot(x, y0, 'k', label='True model') - plt.legend() - - plt.show() - - print(kreg.tkde.tkde.inv_hs) - print(kreg.tkde.tkde.hs) - -_TINY = np.finfo(float).machar.tiny -_REALMIN = np.finfo(float).machar.xmin -_REALMAX = np.finfo(float).machar.xmax -_EPS = np.finfo(float).eps - - -def _logit(p): - pc = p.clip(min=0, max=1) - return (np.log(pc) - np.log1p(-pc)).clip(min=-40, max=40) - - -def _logitinv(x): - return 1.0 / (np.exp(-x) + 1) - - -def _get_data(n=100, symmetric=False, loc1=1.1, scale1=0.6, scale2=1.0): - import scipy.stats as st - # from sg_filter import SavitzkyGolay - dist = st.norm - - norm1 = scale2 * (dist.pdf(-loc1, loc=-loc1, scale=scale1) + - dist.pdf(-loc1, loc=loc1, scale=scale1)) - - def fun1(x): - return ((dist.pdf(x, loc=-loc1, scale=scale1) + - dist.pdf(x, loc=loc1, scale=scale1)) / norm1).clip(max=1.0) - - x = np.sort(6 * np.random.rand(n, 1) - 3, axis=0) - - y = (fun1(x) > np.random.rand(n, 1)).ravel() - # y = (np.cos(x)>2*np.random.rand(n, 1)-1).ravel() - x = x.ravel() 
- - if symmetric: - xi = np.hstack((x.ravel(), -x.ravel())) - yi = np.hstack((y, y)) - i = np.argsort(xi) - x = xi[i] - y = yi[i] - return x, y, fun1 - - -def kreg_demo2(n=100, hs=None, symmetric=False, fun='hisj', plotlog=False): - x, y, fun1 = _get_data(n, symmetric) - kreg_demo3(x, y, fun1, hs=None, fun='hisj', plotlog=False) - - -def kreg_demo3(x, y, fun1, hs=None, fun='hisj', plotlog=False): - st = stats - - alpha = 0.1 - z0 = -_invnorm(alpha / 2) - - n = x.size - hopt, hs1, hs2 = _get_regression_smooting(x, y, fun='hos') - if hs is None: - hs = hopt - - forward = _logit - reverse = _logitinv - # forward = np.log - # reverse = np.exp - - xmin, xmax = x.min(), x.max() - ni = max(2 * int((xmax - xmin) / hopt) + 3, 5) - print(ni) - print(xmin, xmax) - sml = hopt * 0.1 - xi = np.linspace(xmin - sml, xmax + sml, ni) - xiii = np.linspace(xmin - sml, xmax + sml, 4 * ni + 1) - - c = gridcount(x, xi) - if (y == 1).any(): - c0 = gridcount(x[y == 1], xi) - else: - c0 = np.zeros(xi.shape) - yi = np.where(c == 0, 0, c0 / c) - - kreg = KRegression(x, y, hs=hs, p=0) - fiii = kreg(xiii) - yiii = interpolate.interp1d(xi, yi)(xiii) - fit = fun1(xiii).clip(max=1.0) - df = np.diff(fiii) - eerr = np.abs((yiii - fiii)).std() + 0.5 * (df[:-1] * df[1:] < 0).sum() / n - err = (fiii - fit).std() - f = kreg( - xiii, output='plotobj', - title='%s err=%1.3f,eerr=%1.3f, n=%d, hs=%1.3f, hs1=%1.3f, hs2=%1.3f' % - (fun, err, eerr, n, hs, hs1, hs2), plotflag=1) - - # yi[yi==0] = 1.0/(c[c!=0].min()+4) - # yi[yi==1] = 1-1.0/(c[c!=0].min()+4) - # yi[yi==0] = fi[yi==0] - # yi[yi==0] = np.exp(stineman_interp(xi[yi==0], xi[yi>0],np.log(yi[yi>0]))) - # yi[yi==0] = fun1(xi[yi==0]) - try: - yi[yi == 0] = yi[yi > 0].min() / sqrt(n) - except: - yi[yi == 0] = 1. / n - yi[yi == 1] = 1 - (1 - yi[yi < 1].max()) / sqrt(n) - - logity = forward(yi) - - gkreg = KRegression(xi, logity, hs=hs, xmin=xmin - hopt, xmax=xmax + hopt) - fg = gkreg.eval_grid( - xi, output='plotobj', title='Kernel regression', plotflag=1) - sa = (fg.data - logity).std() - sa2 = iqrange(fg.data - logity) / 1.349 - # print('sa=%g %g' % (sa, sa2)) - sa = min(sa, sa2) - -# plt.figure(1) -# plt.plot(xi, slogity-logity,'r.') -# plt.plot(xi, logity-,'b.') -# plt.plot(xi, fg.data-logity, 'b.') -# plt.show() -# return - - fg = gkreg.eval_grid( - xiii, output='plotobj', title='Kernel regression', plotflag=1) - pi = reverse(fg.data) - - dx = xi[1] - xi[0] - ckreg = KDE(x, hs=hs) - # ci = ckreg.eval_grid_fast(xi)*n*dx - ciii = ckreg.eval_grid_fast(xiii) * dx * x.size # n*(1+symmetric) - -# sa1 = np.sqrt(1./(ciii*pi*(1-pi))) -# plo3 = reverse(fg.data-z0*sa) -# pup3 = reverse(fg.data+z0*sa) - fg.data = pi - pi = f.data - - # ref Casella and Berger (1990) "Statistical inference" pp444 -# a = 2*pi + z0**2/(ciii+1e-16) -# b = 2*(1+z0**2/(ciii+1e-16)) -# plo2 = ((a-sqrt(a**2-2*pi**2*b))/b).clip(min=0,max=1) -# pup2 = ((a+sqrt(a**2-2*pi**2*b))/b).clip(min=0,max=1) - # Jeffreys intervall a=b=0.5 - # st.beta.isf(alpha/2, x+a, n-x+b) - ab = 0.07 # 0.055 - pi1 = pi # fun1(xiii) - pup2 = np.where(pi == 1, - 1, - st.beta.isf(alpha / 2, - ciii * pi1 + ab, - ciii * (1 - pi1) + ab)) - plo2 = np.where(pi == 0, - 0, - st.beta.isf(1 - alpha / 2, - ciii * pi1 + ab, - ciii * (1 - pi1) + ab)) - - averr = np.trapz(pup2 - plo2, xiii) / \ - (xiii[-1] - xiii[0]) + 0.5 * (df[:-1] * df[1:] < 0).sum() - - # f2 = kreg_demo4(x, y, hs, hopt) - # Wilson score - den = 1 + (z0 ** 2. 
/ ciii) - xc = (pi1 + (z0 ** 2) / (2 * ciii)) / den - halfwidth = (z0 * sqrt((pi1 * (1 - pi1) / ciii) + - (z0 ** 2 / (4 * (ciii ** 2))))) / den - plo = (xc - halfwidth).clip(min=0) # wilson score - pup = (xc + halfwidth).clip(max=1.0) # wilson score - # pup = (pi + z0*np.sqrt(pi*(1-pi)/ciii)).clip(min=0,max=1) # dont use - # plo = (pi - z0*np.sqrt(pi*(1-pi)/ciii)).clip(min=0,max=1) - - # mi = kreg.eval_grid(x) - # sigma = (stineman_interp(x, xiii, pup)-stineman_interp(x, xiii, plo))/4 - # aic = np.abs((y-mi)/sigma).std()+ 0.5*(df[:-1]*df[1:]<0).sum()/n - # aic = np.abs((yiii-fiii)/(pup-plo)).std() + \ - # 0.5*(df[:-1]*df[1:]<0).sum() + \ - # ((yiii-pup).clip(min=0)-(yiii-plo).clip(max=0)).sum() - - k = (df[:-1] * df[1:] < 0).sum() # numpeaks - sigmai = (pup - plo) - aic = (((yiii - fiii) / sigmai) ** 2).sum() + \ - 2 * k * (k + 1) / np.maximum(ni - k + 1, 1) + \ - np.abs((yiii - pup).clip(min=0) - (yiii - plo).clip(max=0)).sum() - - # aic = (((yiii-fiii)/sigmai)**2).sum()+ 2*k*(k+1)/(ni-k+1) + \ - # np.abs((yiii-pup).clip(min=0)-(yiii-plo).clip(max=0)).sum() - - # aic = averr + ((yiii-pup).clip(min=0)-(yiii-plo).clip(max=0)).sum() - - fg.plot(label='KReg grid aic=%2.3f' % (aic)) - f.plot(label='KReg averr=%2.3f ' % (averr)) - labtxt = '%d CI' % (int(100 * (1 - alpha))) - plt.fill_between(xiii, pup, plo, alpha=0.20, - color='r', linestyle='--', label=labtxt) - plt.fill_between(xiii, pup2, plo2, alpha=0.20, color='b', - linestyle=':', label='%d CI2' % (int(100 * (1 - alpha)))) - plt.plot(xiii, fun1(xiii), 'r', label='True model') - plt.scatter(xi, yi, label='data') - print('maxp = %g' % (np.nanmax(f.data))) - print('hs = %g' % (kreg.tkde.tkde.hs)) - plt.legend() - h = plt.gca() - if plotlog: - plt.setp(h, yscale='log') - # plt.show() - return hs1, hs2 - - -def kreg_demo4(x, y, hs, hopt, alpha=0.05): - st = stats - - n = x.size - xmin, xmax = x.min(), x.max() - ni = max(2 * int((xmax - xmin) / hopt) + 3, 5) - - sml = hopt * 0.1 - xi = np.linspace(xmin - sml, xmax + sml, ni) - xiii = np.linspace(xmin - sml, xmax + sml, 4 * ni + 1) - - kreg = KRegression(x, y, hs=hs, p=0) - - dx = xi[1] - xi[0] - ciii = kreg.tkde.eval_grid_fast(xiii) * dx * x.size -# ckreg = KDE(x,hs=hs) -# ciiii = ckreg.eval_grid_fast(xiii)*dx* x.size #n*(1+symmetric) - - f = kreg(xiii, output='plotobj') # , plot_kwds=dict(plotflag=7)) - pi = f.data - - # Jeffreys intervall a=b=0.5 - # st.beta.isf(alpha/2, x+a, n-x+b) - ab = 0.07 # 0.5 - pi1 = pi - pup = np.where(pi1 == 1, 1, st.beta.isf( - alpha / 2, ciii * pi1 + ab, ciii * (1 - pi1) + ab)) - plo = np.where(pi1 == 0, 0, st.beta.isf( - 1 - alpha / 2, ciii * pi1 + ab, ciii * (1 - pi1) + ab)) - - # Wilson score - # z0 = -_invnorm(alpha/2) -# den = 1+(z0**2./ciii); -# xc=(pi1+(z0**2)/(2*ciii))/den; -# halfwidth=(z0*sqrt((pi1*(1-pi1)/ciii)+(z0**2/(4*(ciii**2)))))/den -# plo2 = (xc-halfwidth).clip(min=0) # wilson score -# pup2 = (xc+halfwidth).clip(max=1.0) # wilson score - # f.dataCI = np.vstack((plo,pup)).T - f.prediction_error_avg = np.trapz(pup - plo, xiii) / (xiii[-1] - xiii[0]) - fiii = f.data - - c = gridcount(x, xi) - if (y == 1).any(): - c0 = gridcount(x[y == 1], xi) - else: - c0 = np.zeros(xi.shape) - yi = np.where(c == 0, 0, c0 / c) - - f.children = [PlotData([plo, pup], xiii, plotmethod='fill_between', - plot_kwds=dict(alpha=0.2, color='r')), - PlotData(yi, xi, plotmethod='scatter', - plot_kwds=dict(color='r', s=5))] - - yiii = interpolate.interp1d(xi, yi)(xiii) - df = np.diff(fiii) - k = (df[:-1] * df[1:] < 0).sum() # numpeaks - sigmai = (pup - plo) - aicc = (((yiii - 
fiii) / sigmai) ** 2).sum() + \ - 2 * k * (k + 1) / np.maximum(ni - k + 1, 1) + \ - np.abs((yiii - pup).clip(min=0) - (yiii - plo).clip(max=0)).sum() - - f.aicc = aicc - f.labels.title = 'perr=%1.3f,aicc=%1.3f, n=%d, hs=%1.3f' % ( - f.prediction_error_avg, aicc, n, hs) - - return f - - -def check_kreg_demo3(): - - plt.ion() - k = 0 - for n in [50, 100, 300, 600, 4000]: - x, y, fun1 = _get_data( - n, symmetric=True, loc1=1.0, scale1=0.6, scale2=1.25) - k0 = k - - for fun in ['hste', ]: - hsmax, _hs1, _hs2 = _get_regression_smooting(x, y, fun=fun) - for hi in np.linspace(hsmax * 0.25, hsmax, 9): - plt.figure(k) - k += 1 - unused = kreg_demo3(x, y, fun1, hs=hi, fun=fun, plotlog=False) - - # kreg_demo2(n=n,symmetric=True,fun='hste', plotlog=False) - fig.tile(range(k0, k)) - plt.ioff() - plt.show() - - -def check_kreg_demo4(): - plt.ion() - # test_docstrings() - # kde_demo2() - # kreg_demo1(fast=True) - # kde_gauss_demo() - # kreg_demo2(n=120,symmetric=True,fun='hste', plotlog=True) - k = 0 - for _i, n in enumerate([100, 300, 600, 4000]): - x, y, fun1 = _get_data( - n, symmetric=True, loc1=0.1, scale1=0.6, scale2=0.75) - # k0 = k - hopt1, _h1, _h2 = _get_regression_smooting(x, y, fun='hos') - hopt2, _h1, _h2 = _get_regression_smooting(x, y, fun='hste') - hopt = sqrt(hopt1 * hopt2) - # hopt = _get_regression_smooting(x,y,fun='hos')[0] - for _j, fun in enumerate(['hste']): # , 'hisj', 'hns', 'hstt' - hsmax, _hs1, _hs2 = _get_regression_smooting(x, y, fun=fun) - - fmax = kreg_demo4(x, y, hsmax + 0.1, hopt) - for hi in np.linspace(hsmax * 0.1, hsmax, 55): - f = kreg_demo4(x, y, hi, hopt) - if f.aicc <= fmax.aicc: - fmax = f - plt.figure(k) - k += 1 - fmax.plot() - plt.plot(x, fun1(x), 'r') - - # kreg_demo2(n=n,symmetric=True,fun='hste', plotlog=False) - fig.tile(range(0, k)) - plt.ioff() - plt.show() - - -def check_regression_bin(): - plt.ion() - # test_docstrings() - # kde_demo2() - # kreg_demo1(fast=True) - # kde_gauss_demo() - # kreg_demo2(n=120,symmetric=True,fun='hste', plotlog=True) - k = 0 - for _i, n in enumerate([100, 300, 600, 4000]): - x, y, fun1 = _get_data( - n, symmetric=True, loc1=0.1, scale1=0.6, scale2=0.75) - fbest = regressionbin(x, y, alpha=0.05, color='g', label='Transit_D') - - figk = plt.figure(k) - ax = figk.gca() - k += 1 - fbest.labels.title = 'N = %d' % n - fbest.plot(axis=ax) - ax.plot(x, fun1(x), 'r') - ax.legend(frameon=False, markerscale=4) - # ax = plt.gca() - ax.set_yticklabels(ax.get_yticks() * 100.0) - ax.grid(True) - - fig.tile(range(0, k)) - plt.ioff() - plt.show() - - -def check_bkregression(): - plt.ion() - k = 0 - for _i, n in enumerate([50, 100, 300, 600]): - x, y, fun1 = _get_data( - n, symmetric=True, loc1=0.1, scale1=0.6, scale2=0.75) - bkreg = BKRegression(x, y) - fbest = bkreg.prb_search_best( - hsfun='hste', alpha=0.05, color='g', label='Transit_D') - - figk = plt.figure(k) - ax = figk.gca() - k += 1 -# fbest.score.plot(axis=ax) -# axsize = ax.axis() -# ax.vlines(fbest.hs,axsize[2]+1,axsize[3]) -# ax.set(yscale='log') - fbest.labels.title = 'N = %d' % n - fbest.plot(axis=ax) - ax.plot(x, fun1(x), 'r') - ax.legend(frameon=False, markerscale=4) - # ax = plt.gca() - ax.set_yticklabels(ax.get_yticks() * 100.0) - ax.grid(True) - - fig.tile(range(0, k)) - plt.ioff() - plt.show() - - -def _get_regression_smooting(x, y, fun='hste'): - hs1 = Kernel('gauss', fun=fun).get_smoothing(x) - # hx = np.median(np.abs(x-np.median(x)))/0.6745*(4.0/(3*n))**0.2 - if (y == 1).any(): - hs2 = Kernel('gauss', fun=fun).get_smoothing(x[y == 1]) - # hy = 
np.median(np.abs(y-np.mean(y)))/0.6745*(4.0/(3*n))**0.2 - else: - hs2 = 4 * hs1 - # hy = 4*hx - - # hy2 = Kernel('gauss', fun=fun).get_smoothing(y) - # kernel = Kernel('gauss',fun=fun) - # hopt = (hs1+2*hs2)/3 - # hopt = (hs1+4*hs2)/5 #kernel.get_smoothing(x) - # hopt = hs2 - hopt = sqrt(hs1 * hs2) - return hopt, hs1, hs2 - - -def empirical_bin_prb(x, y, hopt, color='r'): - """Returns empirical binomial probabiltity. - - Parameters - ---------- - x : ndarray - position ve - y : ndarray - binomial response variable (zeros and ones) - - Returns - ------- - P(x) : PlotData object - empirical probability - - """ - xmin, xmax = x.min(), x.max() - ni = max(2 * int((xmax - xmin) / hopt) + 3, 5) - - sml = hopt # *0.1 - xi = np.linspace(xmin - sml, xmax + sml, ni) - - c = gridcount(x, xi) - if (y == 1).any(): - c0 = gridcount(x[y == 1], xi) - else: - c0 = np.zeros(xi.shape) - yi = np.where(c == 0, 0, c0 / c) - return PlotData(yi, xi, plotmethod='scatter', - plot_kwds=dict(color=color, s=5)) - - -def smoothed_bin_prb(x, y, hs, hopt, alpha=0.05, color='r', label='', - bin_prb=None): - ''' - Parameters - ---------- - x,y - hs : smoothing parameter - hopt : spacing in empirical_bin_prb - alpha : confidence level - color : color of plot object - bin_prb : PlotData object with empirical bin prb - ''' - if bin_prb is None: - bin_prb = empirical_bin_prb(x, y, hopt, color) - - xi = bin_prb.args - yi = bin_prb.data - ni = len(xi) - dxi = xi[1] - xi[0] - - n = x.size - - xiii = np.linspace(xi[0], xi[-1], 10 * ni + 1) - - kreg = KRegression(x, y, hs=hs, p=0) - # expected number of data in each bin - ciii = kreg.tkde.eval_grid_fast(xiii) * dxi * n - - f = kreg(xiii, output='plotobj') # , plot_kwds=dict(plotflag=7)) - pi = f.data - - st = stats - # Jeffreys intervall a=b=0.5 - # st.beta.isf(alpha/2, x+a, n-x+b) - ab = 0.07 # 0.5 - pi1 = pi - pup = np.where(pi1 == 1, 1, st.beta.isf( - alpha / 2, ciii * pi1 + ab, ciii * (1 - pi1) + ab)) - plo = np.where(pi1 == 0, 0, st.beta.isf( - 1 - alpha / 2, ciii * pi1 + ab, ciii * (1 - pi1) + ab)) - - # Wilson score - # z0 = -_invnorm(alpha/2) -# den = 1+(z0**2./ciii); -# xc=(pi1+(z0**2)/(2*ciii))/den; -# halfwidth=(z0*sqrt((pi1*(1-pi1)/ciii)+(z0**2/(4*(ciii**2)))))/den -# plo2 = (xc-halfwidth).clip(min=0) # wilson score -# pup2 = (xc+halfwidth).clip(max=1.0) # wilson score - # f.dataCI = np.vstack((plo,pup)).T - f.prediction_error_avg = np.trapz(pup - plo, xiii) / (xiii[-1] - xiii[0]) - fiii = f.data - - f.plot_kwds['color'] = color - f.plot_kwds['linewidth'] = 2 - if label: - f.plot_kwds['label'] = label - f.children = [PlotData([plo, pup], xiii, plotmethod='fill_between', - plot_kwds=dict(alpha=0.2, color=color)), - bin_prb] - - yiii = interpolate.interp1d(xi, yi)(xiii) - df = np.diff(fiii) - k = (df[:-1] * df[1:] < 0).sum() # numpeaks - sigmai = (pup - plo) - aicc = (((yiii - fiii) / sigmai) ** 2).sum() + \ - 2 * k * (k + 1) / np.maximum(ni - k + 1, 1) + \ - np.abs((yiii - pup).clip(min=0) - (yiii - plo).clip(max=0)).sum() - - f.aicc = aicc - f.fun = kreg - f.labels.title = 'perr=%1.3f,aicc=%1.3f, n=%d, hs=%1.3f' % ( - f.prediction_error_avg, aicc, n, hs) - - return f - - -def regressionbin(x, y, alpha=0.05, color='r', label=''): - """Return kernel regression estimate for binomial data. 
- - Parameters - ---------- - x : arraylike - positions - y : arraylike - of 0 and 1 - - """ - - hopt1, _h1, _h2 = _get_regression_smooting(x, y, fun='hos') - hopt2, _h1, _h2 = _get_regression_smooting(x, y, fun='hste') - hopt = sqrt(hopt1 * hopt2) - - fbest = smoothed_bin_prb(x, y, hopt2 + 0.1, hopt, alpha, color, label) - bin_prb = fbest.children[-1] - for fun in ['hste']: # , 'hisj', 'hns', 'hstt' - hsmax, _hs1, _hs2 = _get_regression_smooting(x, y, fun=fun) - for hi in np.linspace(hsmax * 0.1, hsmax, 55): - f = smoothed_bin_prb(x, y, hi, hopt, alpha, color, label, bin_prb) - if f.aicc <= fbest.aicc: - fbest = f - # hbest = hi - return fbest - - -def kde_gauss_demo(n=50): - """KDEDEMO Demonstrate the KDEgauss. - - KDEDEMO1 shows the true density (dotted) compared to KDE based on 7 - observations (solid) and their individual kernels (dashed) for 3 - different values of the smoothing parameter, hs. - - """ - - st = stats - # x = np.linspace(-4, 4, 101) - # data = np.random.normal(loc=0, scale=1.0, size=n) - # data = np.random.exponential(scale=1.0, size=n) -# n1 = 128 -# I = (np.arange(n1)*pi)**2 *0.01*0.5 -# kw = exp(-I) -# plt.plot(idctn(kw)) -# return - # dist = st.norm - dist = st.expon - data = dist.rvs(loc=0, scale=1.0, size=n) - d, _N = np.atleast_2d(data).shape - - if d == 1: - plot_options = [dict(color='red'), dict( - color='green'), dict(color='black')] - else: - plot_options = [dict(colors='red'), dict(colors='green'), - dict(colors='black')] - - plt.figure(1) - kde0 = KDE(data, kernel=Kernel('gauss', 'hste')) - f0 = kde0.eval_grid_fast(output='plot', ylab='Density') - f0.plot(**plot_options[0]) - - kde1 = TKDE(data, kernel=Kernel('gauss', 'hisj'), L2=.5) - f1 = kde1.eval_grid_fast(output='plot', ylab='Density') - f1.plot(**plot_options[1]) - - kde2 = KDEgauss(data) - f2 = kde2(output='plot', ylab='Density') - x = f2.args - f2.plot(**plot_options[2]) - - fmax = dist.pdf(x, 0, 1).max() - if d == 1: - plt.plot(x, dist.pdf(x, 0, 1), 'k:') - plt.axis([x.min(), x.max(), 0, fmax]) - plt.show() - print(fmax / f2.data.max()) - format_ = ''.join(('%g, ') * d) - format_ = 'hs0=%s hs1=%s hs2=%s' % (format_, format_, format_) - print(format_ % tuple(kde0.hs.tolist() + - kde1.tkde.hs.tolist() + kde2.hs.tolist())) - print('inc0 = %d, inc1 = %d, inc2 = %d' % (kde0.inc, kde1.inc, kde2.inc)) - - -def test_kde(): - data = np.array([ - 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887, - 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, - 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, - 1.8919469, 0.72433808, 1.92973094, 0.44749838, 1.36508452]) - - x = np.linspace(0.01, max(data.ravel()) + 1, 10) - kde = TKDE(data, hs=0.5, L2=0.5) - _f = kde(x) - # f = array([1.03982714, 0.45839018, 0.39514782, 0.32860602, 0.26433318, - # 0.20717946, 0.15907684, 0.1201074 , 0.08941027, 0.06574882]) - - _f1 = kde.eval_grid(x) - # array([ 1.03982714, 0.45839018, 0.39514782, 0.32860602, 0.26433318, - # 0.20717946, 0.15907684, 0.1201074 , 0.08941027, 0.06574882]) - - _f2 = kde.eval_grid_fast(x) - # array([ 1.06437223, 0.46203314, 0.39593137, 0.32781899, 0.26276433, - # 0.20532206, 0.15723498, 0.11843998, 0.08797755, 0. 
])
-
-
-def test_docstrings():
-    import doctest
-    print('Testing docstrings in %s' % __file__)
-    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
-
-
-if __name__ == '__main__':
-    test_docstrings()
-    # test_kde()
-    # check_bkregression()
-    # check_regression_bin()
-    # check_kreg_demo3()
-    # check_kreg_demo4()
-
-    # test_smoothn_1d()
-    # test_smoothn_2d()
-
-    # kde_demo2()
-    # kreg_demo1(fast=True)
-    # kde_gauss_demo()
-    # kreg_demo2(n=120,symmetric=True,fun='hste', plotlog=True)
-    # plt.show('hold')
diff --git a/wafo/kdetools/__init__.py b/wafo/kdetools/__init__.py
new file mode 100644
index 0000000..f113919
--- /dev/null
+++ b/wafo/kdetools/__init__.py
@@ -0,0 +1,3 @@
+from .kdetools import * #@PydevCodeAnalysisIgnore
+from .gridding import * #@PydevCodeAnalysisIgnore
+from .kernels import * #@PydevCodeAnalysisIgnore
diff --git a/wafo/kdetools/gridding.py b/wafo/kdetools/gridding.py
new file mode 100644
index 0000000..8afc712
--- /dev/null
+++ b/wafo/kdetools/gridding.py
@@ -0,0 +1,324 @@
+'''
+Created on 15. Dec. 2016
+
+@author: pab
+'''
+from __future__ import division
+from scipy import sparse
+import numpy as np
+from wafo.testing import test_docstrings
+from itertools import product
+
+__all__ = ['accum', 'gridcount']
+
+
+def bitget(int_type, offset):
+    """Returns the value of the bit at the offset position in int_type.
+
+    Example
+    -------
+    >>> bitget(5, np.r_[0:4])
+    array([1, 0, 1, 0])
+
+    """
+    return np.bitwise_and(int_type, 1 << offset) >> offset
+
+
+def accumsum(accmap, a, shape, dtype=None):
+    """
+    Example
+    -------
+    >>> from numpy import array
+    >>> a = array([[1,2,3],[4,-1,6],[-1,8,9]])
+    >>> a
+    array([[ 1,  2,  3],
+           [ 4, -1,  6],
+           [-1,  8,  9]])
+    >>> # Sum the diagonals.
+    >>> accmap = array([[0,1,2],[2,0,1],[1,2,0]])
+    >>> s = accumsum(accmap, a, (3,))
+    >>> np.allclose(s.toarray().ravel(), [9, 7, 15])
+    True
+
+    """
+    if dtype is None:
+        dtype = a.dtype
+    shape = np.atleast_1d(shape)
+    if len(shape) > 1:
+        binx = accmap[:, 0]
+        biny = accmap[:, 1]
+        out = sparse.coo_matrix(
+            (a.ravel(), (binx, biny)), shape=shape, dtype=dtype).tocsr()
+    else:
+        binx = accmap.ravel()
+        zero = np.zeros(len(binx))
+        out = sparse.coo_matrix(
+            (a.ravel(), (binx, zero)), shape=(shape, 1), dtype=dtype).tocsr()
+    return out
+
+
+def accumsum2(accmap, a, shape):
+    """
+    Example
+    -------
+    >>> from numpy import array
+    >>> a = array([[1,2,3],[4,-1,6],[-1,8,9]])
+    >>> a
+    array([[ 1,  2,  3],
+           [ 4, -1,  6],
+           [-1,  8,  9]])
+    >>> # Sum the diagonals.
+    >>> accmap = array([[0,1,2],[2,0,1],[1,2,0]])
+    >>> s = accumsum2(accmap, a, (3,))
+    >>> np.allclose(s, [9, 7, 15])
+    True
+
+    """
+    return np.bincount(accmap.ravel(), a.ravel(), np.array(shape).max())
+
+
+def accum(accmap, a, func=None, size=None, fill_value=0, dtype=None):
+    """An accumulation function similar to Matlab's `accumarray` function.
+
+    Parameters
+    ----------
+    accmap : ndarray
+        This is the "accumulation map". It maps input (i.e. indices into
+        `a`) to their destination in the output array. The first `a.ndim`
+        dimensions of `accmap` must be the same as `a.shape`. That is,
+        `accmap.shape[:a.ndim]` must equal `a.shape`. For example, if `a`
+        has shape (15,4), then `accmap.shape[:2]` must equal (15,4). In this
+        case `accmap[i,j]` gives the index into the output array where
+        element (i,j) of `a` is to be accumulated. If the output is, say,
+        2D, then `accmap` must have shape (15,4,2). The values in the
+        last dimension give indices into the output array.
+        If the output is 1D, then the shape of `accmap` can be either
+        (15,4) or (15,4,1).
+    a : ndarray
+        The input data to be accumulated.
+    func : callable or None
+        The accumulation function. The function will be passed a list
+        of values from `a` to be accumulated.
+        If None, numpy.sum is assumed.
+    size : ndarray or None
+        The size of the output array. If None, the size will be determined
+        from `accmap`.
+    fill_value : scalar
+        The default value for elements of the output array.
+    dtype : numpy data type, or None
+        The data type of the output array. If None, the data type of
+        `a` is used.
+
+    Returns
+    -------
+    out : ndarray
+        The accumulated results.
+
+        The shape of `out` is `size` if `size` is given. Otherwise the
+        shape is determined by the (lexicographically) largest indices of
+        the output found in `accmap`.
+
+    Examples
+    --------
+    >>> from numpy import array, prod
+    >>> a = array([[1,2,3],[4,-1,6],[-1,8,9]])
+    >>> a
+    array([[ 1,  2,  3],
+           [ 4, -1,  6],
+           [-1,  8,  9]])
+    >>> # Sum the diagonals.
+    >>> accmap = array([[0,1,2],[2,0,1],[1,2,0]])
+    >>> s = accum(accmap, a)
+    >>> s
+    array([ 9,  7, 15])
+    >>> # A 2D output, from sub-arrays with shapes and positions like this:
+    >>> # [ (2,2) (2,1)]
+    >>> # [ (1,2) (1,1)]
+    >>> accmap = array([
+    ...     [[0,0],[0,0],[0,1]],
+    ...     [[0,0],[0,0],[0,1]],
+    ...     [[1,0],[1,0],[1,1]]])
+    >>> # Accumulate using a product.
+    >>> accum(accmap, a, func=prod, dtype=float)
+    array([[ -8.,  18.],
+           [ -8.,   9.]])
+    >>> # Same accmap, but create an array of lists of values.
+    >>> accum(accmap, a, func=lambda x: x, dtype='O')
+    array([[[1, 2, 4, -1], [3, 6]],
+           [[-1, 8], [9]]], dtype=object)
+
+    """
+
+    def create_array_of_python_lists(accmap, a, size):
+        vals = np.empty(size, dtype='O')
+        for s in product(*[range(k) for k in size]):
+            vals[s] = []
+
+        for s in product(*[range(k) for k in a.shape]):
+            indx = tuple(accmap[s])
+            val = a[s]
+            vals[indx].append(val)
+
+        return vals
+
+    # Check for bad arguments and handle the defaults.
+    if accmap.shape[:a.ndim] != a.shape:
+        raise ValueError(
+            "The initial dimensions of accmap must be the same as a.shape")
+    if func is None:
+        func = np.sum
+    if dtype is None:
+        dtype = a.dtype
+    if accmap.shape == a.shape:
+        accmap = np.expand_dims(accmap, -1)
+    adims = tuple(range(a.ndim))
+    if size is None:
+        size = 1 + np.squeeze(np.apply_over_axes(np.max, accmap, axes=adims))
+    size = np.atleast_1d(size)
+
+    # Create an array of python lists of values.
+    vals = create_array_of_python_lists(accmap, a, size)
+
+    # Create the output array.
+    out = np.empty(size, dtype=dtype)
+    for s in product(*[range(k) for k in size]):
+        if vals[s] == []:
+            out[s] = fill_value
+        else:
+            out[s] = func(vals[s])
+    return out
+
+
+def gridcount(data, X, y=1):
+    '''
+    Returns D-dimensional histogram using linear binning.
+
+    Parameters
+    ----------
+    data = column vectors with D-dimensional data, shape D x Nd
+    X = row vectors defining discretization, shape D x N
+        Must include the range of the data.
+
+    Returns
+    -------
+    c = gridcount, shape N x N x ... x N
+
+    GRIDCOUNT obtains the grid counts using linear binning.
+    There are 2 strategies: simple- or linear- binning.
+    Suppose that an observation occurs at x and that the nearest points
+    below and above are y and z, respectively. Then the simple binning
+    strategy assigns a unit weight to either y or z, whichever is closer.
+    Linear binning, on the other hand, assigns the grid point at y a weight
+    of (z-x)/(z-y) and the grid point at z a weight of (x-y)/(z-y), so that
+    the two weights are non-negative and sum to one.
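+    For example, with neighbouring grid points y = 2 and z = 3, an
+    observation at x = 2.3 contributes a weight of 0.7 to y and 0.3 to z.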
+ + In terms of approximation error of using gridcounts as pdf-estimate, + linear binning is significantly more accurate than simple binning. + + NOTE: The interval [min(X);max(X)] must include the range of the data. + The order of C is permuted in the same order as + meshgrid for D==2 or D==3. + + Example + ------- + >>> import numpy as np + >>> import wafo.kdetools as wk + >>> import pylab as plb + >>> N = 20 + >>> data = np.random.rayleigh(1,N) + >>> data = np.array( + ... [ 1.07855907, 1.51199717, 1.54382893, 1.54774808, 1.51913566, + ... 1.11386486, 1.49146216, 1.51127214, 2.61287913, 0.94793051, + ... 2.08532731, 1.35510641, 0.56759888, 1.55766981, 0.77883602, + ... 0.9135759 , 0.81177855, 1.02111483, 1.76334202, 0.07571454]) + >>> x = np.linspace(0,max(data)+1,50) + >>> dx = x[1]-x[0] + + >>> c = wk.gridcount(data, x) + >>> np.allclose(c[:5], [ 0., 0.9731147, 0.0268853, 0., 0.]) + True + + >>> pdf = c/dx/N + >>> np.allclose(np.trapz(pdf, x), 1) + True + + h = plb.plot(x,c,'.') # 1D histogram + h1 = plb.plot(x, pdf) # 1D probability density plot + + See also + -------- + bincount, accum, kdebin + + Reference + ---------- + Wand,M.P. and Jones, M.C. (1995) + 'Kernel smoothing' + Chapman and Hall, pp 182-192 + ''' + dat = np.atleast_2d(data) + x = np.atleast_2d(X) + y = np.atleast_1d(y).ravel() + d = dat.shape[0] + d1, inc = x.shape + + if d != d1: + raise ValueError('Dimension 0 of data and X do not match.') + + dx = np.diff(x[:, :2], axis=1) + xlo = x[:, 0] + xup = x[:, -1] + + datlo = dat.min(axis=1) + datup = dat.max(axis=1) + if ((datlo < xlo) | (xup < datup)).any(): + raise ValueError('X does not include whole range of the data!') + + csiz = np.repeat(inc, d) + use_sparse = False + if use_sparse: + acfun = accumsum # faster than accum + else: + acfun = accumsum2 # accum + + binx = np.asarray(np.floor((dat - xlo[:, np.newaxis]) / dx), dtype=int) + w = dx.prod() + if d == 1: + x.shape = (-1,) + c = np.asarray((acfun(binx, (x[binx + 1] - dat) * y, shape=(inc, )) + + acfun(binx + 1, (dat - x[binx]) * y, shape=(inc, ))) / + w).ravel() + else: # d>2 + + Nc = csiz.prod() + c = np.zeros((Nc,)) + + fact2 = np.asarray(np.reshape(inc * np.arange(d), (d, -1)), dtype=int) + fact1 = np.asarray( + np.reshape(csiz.cumprod() / inc, (d, -1)), dtype=int) + # fact1 = fact1(ones(n,1),:); + bt0 = [0, 0] + X1 = X.ravel() + for ir in range(2 ** (d - 1)): + bt0[0] = np.reshape(bitget(ir, np.arange(d)), (d, -1)) + bt0[1] = 1 - bt0[0] + for ix in range(2): + one = np.mod(ix, 2) + two = np.mod(ix + 1, 2) + # Convert to linear index + # linear index to c + b1 = np.sum((binx + bt0[one]) * fact1, axis=0) + bt2 = bt0[two] + fact2 + b2 = binx + bt2 # linear index to X + c += acfun(b1, np.abs(np.prod(X1[b2] - dat, axis=0)) * y, + shape=(Nc,)) + + c = np.reshape(c / w, csiz, order='F') + + T = [i for i in range(d)] + T[1], T[0] = T[0], T[1] + # make sure c is stored in the same way as meshgrid + c = c.transpose(*T) + return c + + +if __name__ == '__main__': + test_docstrings(__file__) diff --git a/wafo/kdetools/kdetools.py b/wafo/kdetools/kdetools.py new file mode 100644 index 0000000..4129341 --- /dev/null +++ b/wafo/kdetools/kdetools.py @@ -0,0 +1,2139 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Name: kdetools +# Purpose: +# +# Author: pab +# +# Created: 01.11.2008 +# Copyright: (c) pab 2008 +# Licence: LGPL +# ------------------------------------------------------------------------- + +from __future__ import absolute_import, division +# from abc 
import ABCMeta, abstractmethod +import copy +import warnings +import numpy as np +import scipy.stats +from scipy import interpolate, linalg, special +from numpy import pi, sqrt, atleast_2d, exp, meshgrid +from wafo.misc import nextpow2 +from wafo.containers import PlotData +from wafo.dctpack import dctn, idctn # , dstn, idstn +from wafo.plotbackend import plotbackend as plt +from wafo.testing import test_docstrings +from wafo.kdetools.kernels import iqrange, qlevels, Kernel +from wafo.kdetools.gridding import gridcount +import time + +try: + from wafo import fig +except ImportError: + warnings.warn('fig import only supported on Windows') + +__all__ = ['TKDE', 'KDE', 'kde_demo1', 'kde_demo2', 'test_docstrings', + 'KRegression', 'KDEgauss'] + + +def _assert(cond, msg): + if not cond: + raise ValueError(msg) + + +def _invnorm(q): + return special.ndtri(q) + + +class _KDE(object): + + """ Kernel-Density Estimator base class. + + Parameters + ---------- + data : (# of dims, # of data)-array + datapoints to estimate from + hs : array-like (optional) + smooting parameter vector/matrix. + (default compute from data using kernel.get_smoothing function) + kernel : kernel function object. + kernel must have get_smoothing method + alpha : real scalar (optional) + sensitivity parameter (default 0 regular KDE) + A good choice might be alpha = 0.5 ( or 1/D) + alpha = 0 Regular KDE (hs is constant) + 0 < alpha <= 1 Adaptive KDE (Make hs change) + + Members + ------- + d : int + number of dimensions + n : int + number of datapoints + + Methods + ------- + kde.eval_grid_fast(x0, x1,..., xd) : array + evaluate the estimated pdf on meshgrid(x0, x1,..., xd) + kde.eval_grid(x0, x1,..., xd) : array + evaluate the estimated pdf on meshgrid(x0, x1,..., xd) + kde.eval_points(points) : array + evaluate the estimated pdf on a provided set of points + kde(x0, x1,..., xd) : array + same as kde.eval_grid(x0, x1,..., xd) + """ + + def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None, + xmax=None, inc=512): + self.dataset = atleast_2d(data) + self.kernel = kernel if kernel else Kernel('gauss') + self.xmin = xmin + self.xmax = xmax + self.hs = hs + self.inc = inc + self.alpha = alpha + self.initialize() + + @property + def n(self): + return self.dataset.shape[1] + + @property + def d(self): + return self.dataset.shape[0] + + @property + def sigma(self): + """minimum(stdev, 0.75 * interquartile-range)""" + iqr = iqrange(self.dataset, axis=-1) + sigma = np.minimum(np.std(self.dataset, axis=-1, ddof=1), iqr / 1.34) + return sigma + + @property + def xmin(self): + return self._xmin + + @xmin.setter + def xmin(self, xmin): + if xmin is None: + self._xmin = self.dataset.min(axis=-1) - 2 * self.sigma + else: + self._xmin = xmin * np.ones(self.d) + + @property + def xmax(self): + return self._xmax + + @xmax.setter + def xmax(self, xmax): + if xmax is None: + self._xmax = self.dataset.max(axis=-1) + 2 * self.sigma + else: + self._xmax = xmax * np.ones(self.d) + + def _replace_negatives_with_default_hs(self, h): + get_default_hs = self.kernel.get_smoothing + ind, = np.where(h <= 0) + for i in ind.tolist(): + h[i] = get_default_hs(self.dataset[i]) + + def _check_hs(self, h): + """make sure it has the correct dimension and replace negative vals""" + h = np.atleast_1d(h) + if (len(h.shape) == 1) or (self.d == 1): + h = h * np.ones(self.d) if max(h.shape) == 1 else h.reshape(self.d) + self._replace_negatives_with_default_hs(h) + return h + + def _invert_hs(self, h): + if (len(h.shape) == 1) or (self.d == 1): + determinant 
= h.prod() + inv_hs = np.diag(1.0 / h) + else: # fully general smoothing matrix + determinant = linalg.det(h) + _assert(0 < determinant, + 'bandwidth matrix h must be positive definit!') + inv_hs = linalg.inv(h) + return inv_hs, determinant + + @property + def hs(self): + return self._hs + + @hs.setter + def hs(self, h): + if h is None: + h = self.kernel.get_smoothing(self.dataset) + h = self._check_hs(h) + inv_hs, deth = self._invert_hs(h) + + self._norm_factor = deth * self.n + self._inv_hs = inv_hs + self._hs = h + + @property + def inc(self): + return self._inc + + @inc.setter + def inc(self, inc): + if inc is None: + _tau, tau = self.kernel.effective_support() + xyzrange = 8 * self.sigma + L1 = 10 + inc = max(48, (L1 * xyzrange / (tau * self.hs)).max()) + inc = 2 ** nextpow2(inc) + self._inc = inc + + @property + def alpha(self): + return self._alpha + + @alpha.setter + def alpha(self, alpha): + self._alpha = alpha + self._lambda = np.ones(self.n) + if alpha > 0: + f = self.eval_points(self.dataset) # pilot estimate + g = np.exp(np.mean(np.log(f))) + self._lambda = (f / g) ** (-alpha) + + def initialize(self): + if self.n > 1: + self._initialize() + + def _initialize(self): + pass + + def get_args(self, xmin=None, xmax=None): + if xmin is None: + xmin = self.xmin + else: + xmin = [min(i, j) for i, j in zip(xmin, self.xmin)] + if xmax is None: + xmax = self.xmax + else: + xmax = [max(i, j) for i, j in zip(xmax, self.xmax)] + args = [] + inc = self.inc + for i in range(self.d): + args.append(np.linspace(xmin[i], xmax[i], inc)) + return args + + def eval_grid_fast(self, *args, **kwds): + """Evaluate the estimated pdf on a grid. + + Parameters + ---------- + arg_0,arg_1,... arg_d-1 : vectors + Alternatively, if no vectors is passed in then + arg_i = linspace(self.xmin[i], self.xmax[i], self.inc) + output : string optional + 'value' if value output + 'data' if object output + + Returns + ------- + values : array-like + The values evaluated at meshgrid(*args). + + """ + if len(args) == 0: + args = self.get_args() + self.args = args + return self._eval_grid_fun(self._eval_grid_fast, *args, **kwds) + + def _eval_grid_fast(self, *args, **kwds): + pass + + def eval_grid(self, *args, **kwds): + """Evaluate the estimated pdf on a grid. + + Parameters + ---------- + arg_0,arg_1,... arg_d-1 : vectors + Alternatively, if no vectors is passed in then + arg_i = linspace(self.xmin[i], self.xmax[i], self.inc) + output : string optional + 'value' if value output + 'data' if object output + + Returns + ------- + values : array-like + The values evaluated at meshgrid(*args). + + """ + if len(args) == 0: + args = self.get_args() + self.args = args + return self._eval_grid_fun(self._eval_grid, *args, **kwds) + + def _eval_grid(self, *args, **kwds): + pass + + def _add_contour_levels(self, wdata): + p_levels = np.r_[10:90:20, 95, 99, 99.9] + try: + c_levels = qlevels(wdata.data, p=p_levels) + wdata.clevels = c_levels + wdata.plevels = p_levels + except Exception as e: + msg = "Could not calculate contour levels!. 
({})".format(str(e)) + warnings.warn(msg) + + def _make_object(self, f, **kwds): + titlestr = 'Kernel density estimate ({})'.format(self.kernel.name) + kwds2 = dict(title=titlestr) + kwds2['plot_kwds'] = dict(plotflag=1) + kwds2.update(**kwds) + args = self.args + if self.d == 1: + args = args[0] + wdata = PlotData(f, args, **kwds2) + if self.d > 1: + self._add_contour_levels(wdata) + return wdata + + def _eval_grid_fun(self, eval_grd, *args, **kwds): + output = kwds.pop('output', 'value') + f = eval_grd(*args, **kwds) + if output == 'value': + return f + return self._make_object(f, **kwds) + + def _check_shape(self, points): + points = atleast_2d(points) + d, m = points.shape + if d != self.d: + _assert(d == 1 and m == self.d, "points have dimension {}, " + "dataset has dimension {}".format(d, self.d)) + # points was passed in as a row vector + points = np.reshape(points, (self.d, 1)) + return points + + def eval_points(self, points, **kwds): + """Evaluate the estimated pdf on a set of points. + + Parameters + ---------- + points : (# of dimensions, # of points)-array + Alternatively, a (# of dimensions,) vector can be passed in and + treated as a single point. + + Returns + ------- + values : (# of points,)-array + The values at each point. + + Raises + ------ + ValueError if the dimensionality of the input points is different than + the dimensionality of the KDE. + + """ + + points = self._check_shape(points) + return self._eval_points(points, **kwds) + + def _eval_points(self, points, **kwds): + pass + + __call__ = eval_grid + + +class TKDE(_KDE): + + """ Transformation Kernel-Density Estimator. + + Parameters + ---------- + dataset : (# of dims, # of data)-array + datapoints to estimate from + hs : array-like (optional) + smooting parameter vector/matrix. + (default compute from data using kernel.get_smoothing function) + kernel : kernel function object. + kernel must have get_smoothing method + alpha : real scalar (optional) + sensitivity parameter (default 0 regular KDE) + A good choice might be alpha = 0.5 ( or 1/D) + alpha = 0 Regular KDE (hs is constant) + 0 < alpha <= 1 Adaptive KDE (Make hs change) + xmin, xmax : vectors + specifying the default argument range for the kde.eval_grid methods. + For the kde.eval_grid_fast methods the values must cover the range of + the data. (default min(data)-range(data)/4, max(data)-range(data)/4) + If a single value of xmin or xmax is given then the boundary is the is + the same for all dimensions. + inc : scalar integer + defining the default dimension of the output from kde.eval_grid methods + (default 512) + (For kde.eval_grid_fast: A value below 50 is very fast to compute but + may give some inaccuracies. Values between 100 and 500 give very + accurate results) + L2 : array-like + vector of transformation parameters (default 1 no transformation) + t(xi;L2) = xi^L2*sign(L2) for L2(i) ~= 0 + t(xi;L2) = log(xi) for L2(i) == 0 + If single value of L2 is given then the transformation is the same in + all directions. 
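+        For example, L2=0.5 gives a square-root transformation, which is
+        often a reasonable choice for non-negative data such as Rayleigh
+        distributed samples (see kde_demo2).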
+ + Members + ------- + d : int + number of dimensions + n : int + number of datapoints + + Methods + ------- + kde.eval_grid_fast(x0, x1,..., xd) : array + evaluate the estimated pdf on meshgrid(x0, x1,..., xd) + kde.eval_grid(x0, x1,..., xd) : array + evaluate the estimated pdf on meshgrid(x0, x1,..., xd) + kde.eval_points(points) : array + evaluate the estimated pdf on a provided set of points + kde(x0, x1,..., xd) : array + same as kde.eval_grid(x0, x1,..., xd) + + Example + ------- + N = 20 + data = np.random.rayleigh(1, size=(N,)) + >>> data = np.array([ + ... 0.75355792, 0.72779194, 0.94149169, 0.07841119,2.32291887, + ... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, + ... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, + ... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452]) + + >>> import wafo.kdetools as wk + >>> x = np.linspace(0.01, max(data.ravel()) + 1, 10) + >>> kde = wk.TKDE(data, hs=0.5, L2=0.5) + >>> f = kde(x) + >>> f + array([ 1.03982714, 0.45839018, 0.39514782, 0.32860602, 0.26433318, + 0.20717946, 0.15907684, 0.1201074 , 0.08941027, 0.06574882]) + + >>> kde.eval_grid(x) + array([ 1.03982714, 0.45839018, 0.39514782, 0.32860602, 0.26433318, + 0.20717946, 0.15907684, 0.1201074 , 0.08941027, 0.06574882]) + + >>> kde.eval_grid_fast(x) + array([ 1.04018924, 0.45838973, 0.39514689, 0.32860532, 0.26433301, + 0.20717976, 0.15907697, 0.1201077 , 0.08941129, 0.06574899]) + + import pylab as plb + h1 = plb.plot(x, f) # 1D probability density plot + t = np.trapz(f, x) + """ + + def __init__(self, data, hs=None, kernel=None, alpha=0.0, xmin=None, + xmax=None, inc=512, L2=None): + self.L2 = L2 + super(TKDE, self).__init__(data, hs, kernel, alpha, xmin, xmax, inc) + + def _initialize(self): + self._check_xmin() + tdataset = self._dat2gaus(self.dataset) + xmin = self.xmin + if xmin is not None: + xmin = self._dat2gaus(np.reshape(xmin, (-1, 1))) + xmax = self.xmax + if xmax is not None: + xmax = self._dat2gaus(np.reshape(xmax, (-1, 1))) + self.tkde = KDE(tdataset, self.hs, self.kernel, self.alpha, xmin, xmax, + self.inc) + if self.inc is None: + self.inc = self.tkde.inc + + def _check_xmin(self): + if self.L2 is not None: + amin = self.dataset.min(axis=-1) + # default no transformation + L2 = np.atleast_1d(self.L2) * np.ones(self.d) + self.xmin = np.where(L2 != 1, + np.maximum(self.xmin, amin / 100.0), + self.xmin).reshape((-1, 1)) + + def _dat2gaus(self, points): + if self.L2 is None: + return points # default no transformation + + # default no transformation + L2 = np.atleast_1d(self.L2) * np.ones(self.d) + + tpoints = copy.copy(points) + for i, v2 in enumerate(L2.tolist()): + tpoints[i] = np.log(points[i]) if v2 == 0 else points[i] ** v2 + return tpoints + + def _gaus2dat(self, tpoints): + if self.L2 is None: + return tpoints # default no transformation + + # default no transformation + L2 = np.atleast_1d(self.L2) * np.ones(self.d) + + points = copy.copy(tpoints) + for i, v2 in enumerate(L2.tolist()): + points[i] = np.exp( + tpoints[i]) if v2 == 0 else tpoints[i] ** (1.0 / v2) + return points + + def _scale_pdf(self, pdf, points): + if self.L2 is None: + return pdf + # default no transformation + L2 = np.atleast_1d(self.L2) * np.ones(self.d) + for i, v2 in enumerate(L2.tolist()): + factor = v2 * np.sign(v2) if v2 else 1 + pdf *= np.where(v2 == 1, 1, points[i] ** (v2 - 1) * factor) + if (np.abs(np.diff(pdf)).max() > 10).any(): + msg = ''' Numerical problems may have occured due to the power + transformation. 
Check the KDE for spurious spikes''' + warnings.warn(msg) + return pdf + + def eval_grid_fast2(self, *args, **kwds): + """Evaluate the estimated pdf on a grid. + + Parameters + ---------- + arg_0,arg_1,... arg_d-1 : vectors + Alternatively, if no vectors is passed in then + arg_i = gauss2dat(linspace(dat2gauss(self.xmin[i]), + dat2gauss(self.xmax[i]), self.inc)) + output : string optional + 'value' if value output + 'data' if object output + + Returns + ------- + values : array-like + The values evaluated at meshgrid(*args). + + """ + return self._eval_grid_fun(self._eval_grid_fast, *args, **kwds) + + def _interpolate(self, points, f, *args, **kwds): + ipoints = meshgrid(*args) if self.d > 1 else args + for i in range(self.d): + points[i].shape = -1, + points = np.asarray(points).T + + fi = interpolate.griddata(points, np.ravel(f), tuple(ipoints), + method='linear', fill_value=0.0) + self.args = args + r = kwds.get('r', 0) + if r == 0: + return fi * (fi > 0) + return fi + + def _eval_grid_fast(self, *args, **kwds): + if self.L2 is None: + f = self.tkde.eval_grid_fast(*args, **kwds) + self.args = self.tkde.args + return f + targs = [] + if len(args): + targs0 = self._dat2gaus(list(args)) + xmin = [min(t) for t in targs0] + xmax = [max(t) for t in targs0] + targs = self.tkde.get_args(xmin, xmax) + tf = self.tkde.eval_grid_fast(*targs) + self.args = self._gaus2dat(list(self.tkde.args)) + points = meshgrid(*self.args) if self.d > 1 else self.args + f = self._scale_pdf(tf, points) + if len(args): + return self._interpolate(points, f, *args, **kwds) + return f + + def _eval_grid(self, *args, **kwds): + if self.L2 is None: + return self.tkde.eval_grid(*args, **kwds) + targs = self._dat2gaus(list(args)) + tf = self.tkde.eval_grid(*targs, **kwds) + points = meshgrid(*args) if self.d > 1 else list(args) + f = self._scale_pdf(tf, points) + return f + + def _eval_points(self, points): + """Evaluate the estimated pdf on a set of points. + + Parameters + ---------- + points : (# of dimensions, # of points)-array + Alternatively, a (# of dimensions,) vector can be passed in and + treated as a single point. + + Returns + ------- + values : (# of points,)-array + The values at each point. + + Raises + ------ + ValueError if the dimensionality of the input points is different than + the dimensionality of the KDE. + + """ + if self.L2 is None: + return self.tkde.eval_points(points) + + tpoints = self._dat2gaus(points) + tf = self.tkde.eval_points(tpoints) + f = self._scale_pdf(tf, points) + return f + + +class KDE(_KDE): + + """ Kernel-Density Estimator. + + Parameters + ---------- + data : (# of dims, # of data)-array + datapoints to estimate from + hs : array-like (optional) + smooting parameter vector/matrix. + (default compute from data using kernel.get_smoothing function) + kernel : kernel function object. + kernel must have get_smoothing method + alpha : real scalar (optional) + sensitivity parameter (default 0 regular KDE) + A good choice might be alpha = 0.5 ( or 1/D) + alpha = 0 Regular KDE (hs is constant) + 0 < alpha <= 1 Adaptive KDE (Make hs change) + xmin, xmax : vectors + specifying the default argument range for the kde.eval_grid methods. + For the kde.eval_grid_fast methods the values must cover the range of + the data. + (default min(data)-range(data)/4, max(data)-range(data)/4) + If a single value of xmin or xmax is given then the boundary is the is + the same for all dimensions. 
+ inc : scalar integer (default 512) + defining the default dimension of the output from kde.eval_grid methods + (For kde.eval_grid_fast: A value below 50 is very fast to compute but + may give some inaccuracies. Values between 100 and 500 give very + accurate results) + + Members + ------- + d : int + number of dimensions + n : int + number of datapoints + + Methods + ------- + kde.eval_grid_fast(x0, x1,..., xd) : array + evaluate the estimated pdf on meshgrid(x0, x1,..., xd) + kde.eval_grid(x0, x1,..., xd) : array + evaluate the estimated pdf on meshgrid(x0, x1,..., xd) + kde.eval_points(points) : array + evaluate the estimated pdf on a provided set of points + kde(x0, x1,..., xd) : array + same as kde.eval_grid(x0, x1,..., xd) + + Example + ------- + N = 20 + data = np.random.rayleigh(1, size=(N,)) + >>> data = np.array([ + ... 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887, + ... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, + ... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, + ... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452]) + + >>> x = np.linspace(0, max(data.ravel()) + 1, 10) + >>> import wafo.kdetools as wk + >>> kde = wk.KDE(data, hs=0.5, alpha=0.5) + >>> f = kde(x) + >>> f + array([ 0.17252055, 0.41014271, 0.61349072, 0.57023834, 0.37198073, + 0.21409279, 0.12738463, 0.07460326, 0.03956191, 0.01887164]) + + >>> kde.eval_grid(x) + array([ 0.17252055, 0.41014271, 0.61349072, 0.57023834, 0.37198073, + 0.21409279, 0.12738463, 0.07460326, 0.03956191, 0.01887164]) + >>> kde.eval_grid_fast(x) + array([ 0.21720891, 0.43308789, 0.59017626, 0.55847998, 0.39681482, + 0.23987473, 0.13113066, 0.06062029, 0.02160104, 0.00559028]) + + >>> kde0 = wk.KDE(data, hs=0.5, alpha=0.0) + >>> kde0.eval_points(x) + array([ 0.2039735 , 0.40252503, 0.54595078, 0.52219649, 0.3906213 , + 0.26381501, 0.16407362, 0.08270612, 0.02991145, 0.00720821]) + + >>> kde0.eval_grid(x) + array([ 0.2039735 , 0.40252503, 0.54595078, 0.52219649, 0.3906213 , + 0.26381501, 0.16407362, 0.08270612, 0.02991145, 0.00720821]) + >>> f = kde0.eval_grid(x, output='plotobj') + >>> f.data + array([ 0.2039735 , 0.40252503, 0.54595078, 0.52219649, 0.3906213 , + 0.26381501, 0.16407362, 0.08270612, 0.02991145, 0.00720821]) + + >>> f = kde0.eval_grid_fast() + >>> np.allclose(np.interp(x, kde0.args[0], f), + ... [ 0.20398034, 0.40252166, 0.54593292, 0.52218993, 0.39062245, + ... 0.26381651, 0.16407487, 0.08270847, 0.02991439, 0.00882095]) + True + >>> f1 = kde0.eval_grid_fast(output='plot') + >>> np.allclose(np.interp(x, f1.args, f1.data), + ... [ 0.20398034, 0.40252166, 0.54593292, 0.52218993, 0.39062245, + ... 0.26381651, 0.16407487, 0.08270847, 0.02991439, 0.00882095]) + True + + h = f1.plot() + import pylab as plb + h1 = plb.plot(x, f) # 1D probability density plot + t = np.trapz(f, x) + """ + + def _eval_grid_fast(self, *args, **kwds): + X = np.vstack(args) + d, inc = X.shape + dx = X[:, 1] - X[:, 0] + + Xn = [] + nfft0 = 2 * inc + nfft = (nfft0,) * d + x0 = np.linspace(-inc, inc, nfft0 + 1) + for i in range(d): + Xn.append(x0[:-1] * dx[i]) + + Xnc = meshgrid(*Xn) # if d > 1 else Xn + + shape0 = Xnc[0].shape + for i in range(d): + Xnc[i].shape = (-1,) + + Xn = np.dot(self._inv_hs, np.vstack(Xnc)) + + # Obtain the kernel weights. 
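+        # The kernel is evaluated on an extended grid of 2 * inc points per
+        # dimension (twice the output grid), so that the circular FFT
+        # convolution with the binned data below does not wrap kernel mass
+        # around the grid boundaries; only the first inc points of the
+        # result are kept.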
+ kw = self.kernel(Xn) + norm_fact0 = (kw.sum() * dx.prod() * self.n) + norm_fact = (self._norm_factor * self.kernel.norm_factor(d, self.n)) + if np.abs(norm_fact0 - norm_fact) > 0.05 * norm_fact: + warnings.warn( + 'Numerical inaccuracy due to too low discretization. ' + + 'Increase the discretization of the evaluation grid ' + + '(inc={})!'.format(inc)) + norm_fact = norm_fact0 + + kw = kw / norm_fact + r = kwds.get('r', 0) + if r != 0: + kw *= np.vstack(Xnc) ** r if d > 1 else Xnc[0] ** r + kw.shape = shape0 + kw = np.fft.ifftshift(kw) + fftn = np.fft.fftn + ifftn = np.fft.ifftn + + y = kwds.get('y', 1.0) + if self.alpha > 0: + y = y / self._lambda**d + + # Find the binned kernel weights, c. + c = gridcount(self.dataset, X, y=y) + # Perform the convolution. + z = np.real(ifftn(fftn(c, s=nfft) * fftn(kw))) +# opt = dict(type=1, norm=None) +# z = idctn(dctn(c, shape=(inc,)*d, **opt) * dctn(kw[:inc], **opt), +# **opt)/(inc-1)/2 +# # if r is odd +# op2 = dict(type=3, norm=None) +# z3 = idstn(dctn(c, shape=(inc,)*d, **op2) * dstn(kw[1:inc+1], **op2), +# **op2)/(inc-1)/2 + + ix = (slice(0, inc),) * d + if r == 0: + return z[ix] * (z[ix] > 0.0) + return z[ix] + + def _eval_grid(self, *args, **kwds): + + grd = meshgrid(*args) if len(args) > 1 else list(args) + shape0 = grd[0].shape + d = len(grd) + for i in range(d): + grd[i] = grd[i].ravel() + f = self.eval_points(np.vstack(grd), **kwds) + return f.reshape(shape0) + + def _moment_fun(self, r): + if r == 0: + return lambda x: 1 + return lambda x: (x ** r).sum(axis=0) + + @property + def norm_factor(self): + return self._norm_factor * self.kernel.norm_factor(self.d, self.n) + + def _loop_over_data(self, data, points, y, r): + fun = self._moment_fun(r) + d, m = points.shape + inv_hs, lambda_ = self._inv_hs, self._lambda + kernel = self.kernel + + y_d_lambda = y / lambda_ ** d + result = np.zeros((m,)) + for i in range(self.n): + dxi = points - data[:, i, np.newaxis] + tdiff = np.dot(inv_hs / lambda_[i], dxi) + result += fun(dxi) * kernel(tdiff) * y_d_lambda[i] + return result / self.norm_factor + + def _loop_over_points(self, data, points, y, r): + fun = self._moment_fun(r) + d, m = points.shape + inv_hs, lambda_ = self._inv_hs, self._lambda + kernel = self.kernel + + y_d_lambda = y / lambda_ ** d + result = np.zeros((m,)) + for i in range(m): + dxi = points[:, i, np.newaxis] - data + tdiff = np.dot(inv_hs, dxi / lambda_[np.newaxis, :]) + result[i] = np.sum(fun(dxi) * kernel(tdiff) * y_d_lambda, axis=-1) + return result / self.norm_factor + + def _eval_points(self, points, **kwds): + """Evaluate the estimated pdf on a set of points. + + Parameters + ---------- + points : (# of dimensions, # of points)-array + Alternatively, a (# of dimensions,) vector can be passed in and + treated as a single point. + + Returns + ------- + values : (# of points,)-array + The values at each point. + + Raises + ------ + ValueError if the dimensionality of the input points is different than + the dimensionality of the KDE. + + """ + d, m = points.shape + _assert(d == self.d, "d={} expected, got {}".format(self.d, d)) + + y = kwds.get('y', 1) + r = kwds.get('r', 0) + + more_points_than_data = m >= self.n + if more_points_than_data: + return self._loop_over_data(self.dataset, points, y, r) + return self._loop_over_points(self.dataset, points, y, r) + + +class KDEgauss(KDE): + + """ Kernel-Density Estimator base class. + + data : (# of dims, # of data)-array + datapoints to estimate from + hs : array-like (optional) + smooting parameter vector/matrix. 
+ (default compute from data using kernel.get_smoothing function) + kernel : kernel function object. + kernel must have get_smoothing method + alpha : real scalar (optional) + sensitivity parameter (default 0 regular KDE) + A good choice might be alpha = 0.5 ( or 1/D) + alpha = 0 Regular KDE (hs is constant) + 0 < alpha <= 1 Adaptive KDE (Make hs change) + xmin, xmax : vectors + specifying the default argument range for the kde.eval_grid methods. + For the kde.eval_grid_fast methods the values must cover the range of + the data. + (default min(data)-range(data)/4, max(data)-range(data)/4) + If a single value of xmin or xmax is given, then the boundary is the + the same for all dimensions. + inc : scalar integer (default 512) + defining the default dimension of the output from kde.eval_grid methods + (For kde.eval_grid_fast: A value below 50 is very fast to compute but + may give some inaccuracies. Values between 100 and 500 give very + accurate results) + + Members + ------- + d : int + number of dimensions + n : int + number of datapoints + + Methods + ------- + kde.eval_grid_fast(x0, x1,..., xd) : array + evaluate the estimated pdf on meshgrid(x0, x1,..., xd) + kde(x0, x1,..., xd) : array + same as kde.eval_grid_fast(x0, x1,..., xd) + """ + def _eval_grid_fast(self, *args, **kwds): + X = np.vstack(args) + d, inc = X.shape + # dx = X[:, 1] - X[:, 0] + R = X.max(axis=-1) - X.min(axis=-1) + + t_star = (self.hs / R) ** 2 + I = (np.asfarray(np.arange(0, inc)) * pi) ** 2 + In = [] + for i in range(d): + In.append(I * t_star[i] * 0.5) + + r = kwds.get('r', 0) + fun = self._moment_fun(r) + + Inc = meshgrid(*In) if d > 1 else In + kw = np.zeros((inc,) * d) + for i in range(d): + kw += exp(-Inc[i]) * fun(Inc[i]) + + y = kwds.get('y', 1.0) + d, n = self.dataset.shape + # Find the binned kernel weights, c. + c = gridcount(self.dataset, X, y=y) + # Perform the convolution. + at = dctn(c) * kw / n + z = idctn(at) * (at.size-1) / np.prod(R) + return z * (z > 0.0) + + __call__ = _KDE.eval_grid_fast + + +class KRegression(_KDE): + + """ Kernel-Regression + + Parameters + ---------- + data : (# of dims, # of data)-array + datapoints to estimate from + y : # of data - array + response variable + p : scalar integer (0 or 1) + Nadaraya-Watson estimator if p=0, + local linear estimator if p=1. + hs : array-like (optional) + smooting parameter vector/matrix. + (default compute from data using kernel.get_smoothing function) + kernel : kernel function object. + kernel must have get_smoothing method + alpha : real scalar (optional) + sensitivity parameter (default 0 regular KDE) + A good choice might be alpha = 0.5 ( or 1/D) + alpha = 0 Regular KDE (hs is constant) + 0 < alpha <= 1 Adaptive KDE (Make hs change) + xmin, xmax : vectors + specifying the default argument range for the kde.eval_grid methods. + For the kde.eval_grid_fast methods the values must cover the range of + the data. (default min(data)-range(data)/4, max(data)-range(data)/4) + If a single value of xmin or xmax is given then the boundary is the is + the same for all dimensions. + inc : scalar integer (default 128) + defining the default dimension of the output from kde.eval_grid methods + (For kde.eval_grid_fast: A value below 50 is very fast to compute but + may give some inaccuracies. 
Values between 100 and 500 give very + accurate results) + + Members + ------- + d : int + number of dimensions + n : int + number of datapoints + + Methods + ------- + kde.eval_grid_fast(x0, x1,..., xd) : array + evaluate the estimated pdf on meshgrid(x0, x1,..., xd) + kde.eval_grid(x0, x1,..., xd) : array + evaluate the estimated pdf on meshgrid(x0, x1,..., xd) + kde.eval_points(points) : array + evaluate the estimated pdf on a provided set of points + kde(x0, x1,..., xd) : array + same as kde.eval_grid(x0, x1,..., xd) + + + Example + ------- + >>> import wafo.kdetools as wk + >>> N = 100 + >>> x = np.linspace(0, 1, N) + >>> ei = np.random.normal(loc=0, scale=0.075, size=(N,)) + >>> ei = np.sqrt(0.075) * np.sin(100*x) + + >>> y = 2*np.exp(-x**2/(2*0.3**2))+3*np.exp(-(x-1)**2/(2*0.7**2)) + ei + >>> kreg = wk.KRegression(x, y) + >>> f = kreg(output='plotobj', title='Kernel regression', plotflag=1) + >>> np.allclose(f.data[:5], + ... [ 3.18670593, 3.18678088, 3.18682196, 3.18682932, 3.18680337]) + True + + h = f.plot(label='p=0') + """ + + def __init__(self, data, y, p=0, hs=None, kernel=None, alpha=0.0, + xmin=None, xmax=None, inc=128, L2=None): + + self.tkde = TKDE(data, hs=hs, kernel=kernel, + alpha=alpha, xmin=xmin, xmax=xmax, inc=inc, L2=L2) + self.y = y + self.p = p + + def eval_grid_fast(self, *args, **kwds): + self._grdfun = self.tkde.eval_grid_fast + return self.tkde._eval_grid_fun(self._eval_gridfun, *args, **kwds) + + def eval_grid(self, *args, **kwds): + self._grdfun = self.tkde.eval_grid + return self.tkde._eval_grid_fun(self._eval_gridfun, *args, **kwds) + + def _eval_gridfun(self, *args, **kwds): + grdfun = self._grdfun + s0 = grdfun(*args, r=0) + t0 = grdfun(*args, r=0, y=self.y) + if self.p == 0: + return (t0 / (s0 + _TINY)).clip(min=-_REALMAX, max=_REALMAX) + elif self.p == 1: + s1 = grdfun(*args, r=1) + s2 = grdfun(*args, r=2) + t1 = grdfun(*args, r=1, y=self.y) + return ((s2 * t0 - s1 * t1) / + (s2 * s0 - s1 ** 2)).clip(min=-_REALMAX, max=_REALMAX) + __call__ = eval_grid_fast + + +class BKRegression(object): + + ''' + Kernel-Regression on binomial data + + method : {'beta', 'wilson'} + method is one of the following + 'beta', return Bayesian Credible interval using beta-distribution. + 'wilson', return Wilson score interval + a, b : scalars + parameters of the beta distribution defining the apriori distribution + of p, i.e., the Bayes estimator for p: p = (y+a)/(n+a+b). + Setting a=b=0.5 gives Jeffreys interval. + ''' + + def __init__(self, *args, **kwds): + self.method = kwds.pop('method', 'beta') + self.a = max(kwds.pop('a', 0.5), _TINY) + self.b = max(kwds.pop('b', 0.5), _TINY) + self.kreg = KRegression(*args, **kwds) + # defines bin width (i.e. 
smoothing) in empirical estimate + self.hs_e = None +# self.x = self.kreg.tkde.dataset +# self.y = self.kreg.y + + def _set_smoothing(self, hs): + self.kreg.tkde.hs = hs + self.kreg.tkde.initialize() + + x = property(fget=lambda cls: cls.kreg.tkde.dataset.squeeze()) + y = property(fget=lambda cls: cls.kreg.y) + kernel = property(fget=lambda cls: cls.kreg.tkde.kernel) + hs = property(fset=_set_smoothing, fget=lambda cls: cls.kreg.tkde.hs) + + def _get_max_smoothing(self, fun=None): + """Return maximum value for smoothing parameter.""" + x = self.x + y = self.y + if fun is None: + get_smoothing = self.kernel.get_smoothing + else: + get_smoothing = getattr(self.kernel, fun) + + hs1 = get_smoothing(x) + # hx = np.median(np.abs(x-np.median(x)))/0.6745*(4.0/(3*n))**0.2 + if (y == 1).any(): + hs2 = get_smoothing(x[y == 1]) + # hy = np.median(np.abs(y-np.mean(y)))/0.6745*(4.0/(3*n))**0.2 + else: + hs2 = 4 * hs1 + # hy = 4*hx + + hopt = sqrt(hs1 * hs2) + return hopt, hs1, hs2 + + def get_grid(self, hs_e=None): + if hs_e is None: + if self.hs_e is None: + hs1 = self._get_max_smoothing('hste')[0] + hs2 = self._get_max_smoothing('hos')[0] + self.hs_e = sqrt(hs1 * hs2) + hs_e = self.hs_e + x = self.x + xmin, xmax = x.min(), x.max() + ni = max(2 * int((xmax - xmin) / hs_e) + 3, 5) + sml = hs_e # *0.1 + xi = np.linspace(xmin - sml, xmax + sml, ni) + return xi + + def prb_ci(self, n, p, alpha=0.05, **kwds): + """Return Confidence Interval for the binomial probability p. + + Parameters + ---------- + n : array-like + number of Bernoulli trials + p : array-like + estimated probability of success in each trial + alpha : scalar + confidence level + method : {'beta', 'wilson'} + method is one of the following + 'beta', return Bayesian Credible interval using beta-distribution. + 'wilson', return Wilson score interval + a, b : scalars + parameters of the beta distribution defining the apriori + distribution of p, i.e., + the Bayes estimator for p: p = (y+a)/(n+a+b). + Setting a=b=0.5 gives Jeffreys interval. + + """ + if self.method.startswith('w'): + # Wilson score + z0 = -_invnorm(alpha / 2) + den = 1 + (z0 ** 2. / n) + xc = (p + (z0 ** 2) / (2 * n)) / den + halfwidth = (z0 * sqrt((p * (1 - p) / n) + + (z0 ** 2 / (4 * (n ** 2))))) / den + plo = (xc - halfwidth).clip(min=0) # wilson score + pup = (xc + halfwidth).clip(max=1.0) # wilson score + else: + # Jeffreys intervall a=b=0.5 + # st.beta.isf(alpha/2, y+a, n-y+b) y = n*p, n-y = n*(1-p) + a = self.a + b = self.b + st = scipy.stats + pup = np.where(p == 1, 1, + st.beta.isf(alpha / 2, n * p + a, n * (1 - p) + b)) + plo = np.where(p == 0, 0, + st.beta.isf(1 - alpha / 2, + n * p + a, n * (1 - p) + b)) + return plo, pup + + def prb_empirical(self, xi=None, hs_e=None, alpha=0.05, color='r', **kwds): + """Returns empirical binomial probabiltity. 
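+
+        The probability in each bin is the ratio of linearly binned
+        successes to trials (both obtained with gridcount), and prb_ci
+        supplies the corresponding confidence interval.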
+ + Parameters + ---------- + x : ndarray + position vector + y : ndarray + binomial response variable (zeros and ones) + alpha : scalar + confidence level + color: + used in plot + + Returns + ------- + P(x) : PlotData object + empirical probability + + """ + if xi is None: + xi = self.get_grid(hs_e) + + x = self.x + y = self.y + + c = gridcount(x, xi) # + self.a + self.b # count data + if (y == 1).any(): + c0 = gridcount(x[y == 1], xi) # + self.a # count success + else: + c0 = np.zeros(np.shape(xi)) + prb = np.where(c == 0, 0, c0 / (c + _TINY)) # assume prb==0 for c==0 + CI = np.vstack(self.prb_ci(c, prb, alpha, **kwds)) + + prb_e = PlotData(prb, xi, plotmethod='plot', plot_args=['.'], + plot_kwds=dict(markersize=6, color=color, picker=5)) + prb_e.dataCI = CI.T + prb_e.count = c + return prb_e + + def prb_smoothed(self, prb_e, hs, alpha=0.05, color='r', label=''): + """Return smoothed binomial probability. + + Parameters + ---------- + prb_e : PlotData object with empirical binomial probabilites + hs : smoothing parameter + alpha : confidence level + color : color of plot object + label : label for plot object + + """ + + x_e = prb_e.args + n_e = len(x_e) + dx_e = x_e[1] - x_e[0] + n = self.x.size + + x_s = np.linspace(x_e[0], x_e[-1], 10 * n_e + 1) + self.hs = hs + + prb_s = self.kreg(x_s, output='plotobj', title='', plot_kwds=dict( + color=color, linewidth=2)) # dict(plotflag=7)) + m_nan = np.isnan(prb_s.data) + if m_nan.any(): # assume 0/0 division + prb_s.data[m_nan] = 0.0 + + # prb_s.data[np.isnan(prb_s.data)] = 0 + # expected number of data in each bin + c_s = self.kreg.tkde.eval_grid_fast(x_s) * dx_e * n + plo, pup = self.prb_ci(c_s, prb_s.data, alpha) + + prb_s.dataCI = np.vstack((plo, pup)).T + prb_s.prediction_error_avg = np.trapz( + pup - plo, x_s) / (x_s[-1] - x_s[0]) + + if label: + prb_s.plot_kwds['label'] = label + prb_s.children = [PlotData([plo, pup], x_s, + plotmethod='fill_between', + plot_kwds=dict(alpha=0.2, color=color)), + prb_e] + + # empirical oversmooths the data +# p_s = prb_s.eval_points(self.x) +# dp_s = np.diff(prb_s.data) +# k = (dp_s[:-1]*dp_s[1:]<0).sum() # numpeaks +# p_e = self.y +# n_s = interpolate.interp1d(x_s, c_s)(self.x) +# plo, pup = self.prb_ci(n_s, p_s, alpha) +# sigmai = (pup-plo) +# aicc = (((p_e-p_s)/sigmai)**2).sum()+ 2*k*(k+1)/np.maximum(n-k+1,1) + + p_e = prb_e.eval_points(x_s) + p_s = prb_s.data + dp_s = np.sign(np.diff(p_s)) + k = (dp_s[:-1] != dp_s[1:]).sum() # numpeaks + + # sigmai = (pup-plo)+_EPS + # aicc = (((p_e-p_s)/sigmai)**2).sum()+ 2*k*(k+1)/np.maximum(n_e-k+1,1) + # + np.abs((p_e-pup).clip(min=0)-(p_e-plo).clip(max=0)).sum() + sigmai = _logit(pup) - _logit(plo) + _EPS + aicc = ((((_logit(p_e) - _logit(p_s)) / sigmai) ** 2).sum() + + 2 * k * (k + 1) / np.maximum(n_e - k + 1, 1) + + np.abs((p_e - pup).clip(min=0) - + (p_e - plo).clip(max=0)).sum()) + + prb_s.aicc = aicc + # prb_s.labels.title = '' + # prb_s.labels.title='perr=%1.3f,aicc=%1.3f, n=%d, hs=%1.3f' % + # (prb_s.prediction_error_avg,aicc,n,hs) + + return prb_s + + def prb_search_best(self, prb_e=None, hsvec=None, hsfun='hste', + alpha=0.05, color='r', label=''): + """Return best smoothed binomial probability. 
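+
+        The candidate smoothing parameters in hsvec are scored with the
+        aicc criterion computed by prb_smoothed, and the fit with the
+        lowest score is returned.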
+ + Parameters + ---------- + prb_e : PlotData object with empirical binomial probabilites + hsvec : arraylike (default np.linspace(hsmax*0.1,hsmax,55)) + vector smoothing parameters + hsfun : + method for calculating hsmax + + """ + if prb_e is None: + prb_e = self.prb_empirical( + hs_e=self.hs_e, alpha=alpha, color=color) + if hsvec is None: + hsmax = self._get_max_smoothing(hsfun)[0] # @UnusedVariable + hsmax = max(hsmax, self.hs_e) + hsvec = np.linspace(hsmax * 0.2, hsmax, 55) + + hs_best = hsvec[-1] + 0.1 + prb_best = self.prb_smoothed(prb_e, hs_best, alpha, color, label) + aicc = np.zeros(np.size(hsvec)) + for i, hi in enumerate(hsvec): + f = self.prb_smoothed(prb_e, hi, alpha, color, label) + aicc[i] = f.aicc + if f.aicc <= prb_best.aicc: + prb_best = f + hs_best = hi + prb_best.score = PlotData(aicc, hsvec) + prb_best.hs = hs_best + self._set_smoothing(hs_best) + return prb_best + + +def kde_demo1(): + """KDEDEMO1 Demonstrate the smoothing parameter impact on KDE. + + KDEDEMO1 shows the true density (dotted) compared to KDE based on 7 + observations (solid) and their individual kernels (dashed) for 3 + different values of the smoothing parameter, hs. + + """ + st = scipy.stats + x = np.linspace(-4, 4, 101) + x0 = x / 2.0 + data = np.random.normal(loc=0, scale=1.0, size=7) + kernel = Kernel('gauss') + hs = kernel.hns(data) + hVec = [hs / 2, hs, 2 * hs] + + for ix, h in enumerate(hVec): + plt.figure(ix) + kde = KDE(data, hs=h, kernel=kernel) + f2 = kde(x, output='plot', title='h_s = {0:2.2f}'.format(h), + ylab='Density') + f2.plot('k-') + + plt.plot(x, st.norm.pdf(x, 0, 1), 'k:') + n = len(data) + plt.plot(data, np.zeros(data.shape), 'bx') + y = kernel(x0) / (n * h * kernel.norm_factor(d=1, n=n)) + for i in range(n): + plt.plot(data[i] + x0 * h, y, 'b--') + plt.plot([data[i], data[i]], [0, np.max(y)], 'b') + + plt.axis([min(x), max(x), 0, 0.5]) + + +def kde_demo2(): + '''Demonstrate the difference between transformation- and ordinary-KDE. + + KDEDEMO2 shows that the transformation KDE is a better estimate for + Rayleigh distributed data around 0 than the ordinary KDE. + ''' + st = scipy.stats + data = st.rayleigh.rvs(scale=1, size=300) + + x = np.linspace(1.5e-2, 5, 55) + + kde = KDE(data) + f = kde(output='plot', title='Ordinary KDE (hs={0:g})'.format(kde.hs)) + plt.figure(0) + f.plot() + + plt.plot(x, st.rayleigh.pdf(x, scale=1), ':') + + # plotnorm((data).^(L2)) # gives a straight line => L2 = 0.5 reasonable + + tkde = TKDE(data, L2=0.5) + ft = tkde(x, output='plot', + title='Transformation KDE (hs={0:g})'.format(tkde.tkde.hs)) + plt.figure(1) + ft.plot() + + plt.plot(x, st.rayleigh.pdf(x, scale=1), ':') + + plt.figure(0) + + +def kde_demo3(): + '''Demonstrate the difference between transformation and ordinary-KDE in 2D + + KDEDEMO3 shows that the transformation KDE is a better estimate for + Rayleigh distributed data around 0 than the ordinary KDE. 
+ ''' + st = scipy.stats + data = st.rayleigh.rvs(scale=1, size=(2, 300)) + + # x = np.linspace(1.5e-3, 5, 55) + + kde = KDE(data) + f = kde(output='plot', title='Ordinary KDE', plotflag=1) + plt.figure(0) + f.plot() + + plt.plot(data[0], data[1], '.') + + # plotnorm((data).^(L2)) % gives a straight line => L2 = 0.5 reasonable + + tkde = TKDE(data, L2=0.5) + ft = tkde.eval_grid_fast( + output='plot', title='Transformation KDE', plotflag=1) + + plt.figure(1) + ft.plot() + + plt.plot(data[0], data[1], '.') + + plt.figure(0) + + +def kde_demo4(N=50): + '''Demonstrate that the improved Sheather-Jones plug-in (hisj) is superior + for 1D multimodal distributions + + KDEDEMO4 shows that the improved Sheather-Jones plug-in smoothing is a + better compared to normal reference rules (in this case the hns) + ''' + st = scipy.stats + + data = np.hstack((st.norm.rvs(loc=5, scale=1, size=(N,)), + st.norm.rvs(loc=-5, scale=1, size=(N,)))) + + # x = np.linspace(1.5e-3, 5, 55) + + kde = KDE(data, kernel=Kernel('gauss', 'hns')) + f = kde(output='plot', title='Ordinary KDE', plotflag=1) + + kde1 = KDE(data, kernel=Kernel('gauss', 'hisj')) + f1 = kde1(output='plot', label='Ordinary KDE', plotflag=1) + + plt.figure(0) + f.plot('r', label='hns={0:g}'.format(kde.hs)) + # plt.figure(2) + f1.plot('b', label='hisj={0:g}'.format(kde1.hs)) + x = np.linspace(-4, 4) + for loc in [-5, 5]: + plt.plot(x + loc, st.norm.pdf(x, 0, scale=1) / 2, 'k:', + label='True density') + plt.legend() + + +def kde_demo5(N=500): + '''Demonstrate that the improved Sheather-Jones plug-in (hisj) is superior + for 2D multimodal distributions + + KDEDEMO5 shows that the improved Sheather-Jones plug-in smoothing is better + compared to normal reference rules (in this case the hns) + ''' + st = scipy.stats + + data = np.hstack((st.norm.rvs(loc=5, scale=1, size=(2, N,)), + st.norm.rvs(loc=-5, scale=1, size=(2, N,)))) + kde = KDE(data, kernel=Kernel('gauss', 'hns')) + f = kde(output='plot', plotflag=1, + title='Ordinary KDE (hns={0:s}'.format(str(kde.hs.tolist()))) + + kde1 = KDE(data, kernel=Kernel('gauss', 'hisj')) + f1 = kde1(output='plot', plotflag=1, + title='Ordinary KDE (hisj={0:s})'.format(str(kde1.hs.tolist()))) + + plt.figure(0) + plt.clf() + f.plot() + plt.plot(data[0], data[1], '.') + plt.figure(1) + plt.clf() + f1.plot() + plt.plot(data[0], data[1], '.') + + +def kreg_demo1(hs=None, fast=False, fun='hisj'): + """""" + N = 100 + # ei = np.random.normal(loc=0, scale=0.075, size=(N,)) + ei = np.array([ + -0.08508516, 0.10462496, 0.07694448, -0.03080661, 0.05777525, + 0.06096313, -0.16572389, 0.01838912, -0.06251845, -0.09186784, + -0.04304887, -0.13365788, -0.0185279, -0.07289167, 0.02319097, + 0.06887854, -0.08938374, -0.15181813, 0.03307712, 0.08523183, + -0.0378058, -0.06312874, 0.01485772, 0.06307944, -0.0632959, + 0.18963205, 0.0369126, -0.01485447, 0.04037722, 0.0085057, + -0.06912903, 0.02073998, 0.1174351, 0.17599277, -0.06842139, + 0.12587608, 0.07698113, -0.0032394, -0.12045792, -0.03132877, + 0.05047314, 0.02013453, 0.04080741, 0.00158392, 0.10237899, + -0.09069682, 0.09242174, -0.15445323, 0.09190278, 0.07138498, + 0.03002497, 0.02495252, 0.01286942, 0.06449978, 0.03031802, + 0.11754861, -0.02322272, 0.00455867, -0.02132251, 0.09119446, + -0.03210086, -0.06509545, 0.07306443, 0.04330647, 0.078111, + -0.04146907, 0.05705476, 0.02492201, -0.03200572, -0.02859788, + -0.05893749, 0.00089538, 0.0432551, 0.04001474, 0.04888828, + -0.17708392, 0.16478644, 0.1171006, 0.11664846, 0.01410477, + -0.12458953, -0.11692081, 0.0413047, 
-0.09292439, -0.07042327, + 0.14119701, -0.05114335, 0.04994696, -0.09520663, 0.04829406, + -0.01603065, -0.1933216, 0.19352763, 0.11819496, 0.04567619, + -0.08348306, 0.00812816, -0.00908206, 0.14528945, 0.02901065]) + x = np.linspace(0, 1, N) + + y0 = 2 * np.exp(-x ** 2 / (2 * 0.3 ** 2)) + \ + 3 * np.exp(-(x - 1) ** 2 / (2 * 0.7 ** 2)) + y = y0 + ei + kernel = Kernel('gauss', fun=fun) + hopt = kernel.hisj(x) + kreg = KRegression( + x, y, p=0, hs=hs, kernel=kernel, xmin=-2 * hopt, xmax=1 + 2 * hopt) + if fast: + kreg.__call__ = kreg.eval_grid_fast + + f = kreg(output='plot', title='Kernel regression', plotflag=1) + plt.figure(0) + f.plot(label='p=0') + + kreg.p = 1 + f1 = kreg(output='plot', title='Kernel regression', plotflag=1) + f1.plot(label='p=1') + # print(f1.data) + plt.plot(x, y, '.', label='data') + plt.plot(x, y0, 'k', label='True model') + plt.legend() + + plt.show() + + print(kreg.tkde.tkde._inv_hs) + print(kreg.tkde.tkde.hs) + +_TINY = np.finfo(float).machar.tiny +_REALMIN = np.finfo(float).machar.xmin +_REALMAX = np.finfo(float).machar.xmax +_EPS = np.finfo(float).eps + + +def _logit(p): + pc = p.clip(min=0, max=1) + return (np.log(pc) - np.log1p(-pc)).clip(min=-40, max=40) + + +def _logitinv(x): + return 1.0 / (np.exp(-x) + 1) + + +def _get_data(n=100, symmetric=False, loc1=1.1, scale1=0.6, scale2=1.0): + st = scipy.stats + # from sg_filter import SavitzkyGolay + dist = st.norm + + norm1 = scale2 * (dist.pdf(-loc1, loc=-loc1, scale=scale1) + + dist.pdf(-loc1, loc=loc1, scale=scale1)) + + def fun1(x): + return ((dist.pdf(x, loc=-loc1, scale=scale1) + + dist.pdf(x, loc=loc1, scale=scale1)) / norm1).clip(max=1.0) + + x = np.sort(6 * np.random.rand(n, 1) - 3, axis=0) + + y = (fun1(x) > np.random.rand(n, 1)).ravel() + # y = (np.cos(x)>2*np.random.rand(n, 1)-1).ravel() + x = x.ravel() + + if symmetric: + xi = np.hstack((x.ravel(), -x.ravel())) + yi = np.hstack((y, y)) + i = np.argsort(xi) + x = xi[i] + y = yi[i] + return x, y, fun1 + + +def kreg_demo2(n=100, hs=None, symmetric=False, fun='hisj', plotlog=False): + x, y, fun1 = _get_data(n, symmetric) + kreg_demo3(x, y, fun1, hs=None, fun='hisj', plotlog=False) + + +def kreg_demo3(x, y, fun1, hs=None, fun='hisj', plotlog=False): + st = scipy.stats + + alpha = 0.1 + z0 = -_invnorm(alpha / 2) + + n = x.size + hopt, hs1, hs2 = _get_regression_smooting(x, y, fun='hos') + if hs is None: + hs = hopt + + forward = _logit + reverse = _logitinv + # forward = np.log + # reverse = np.exp + + xmin, xmax = x.min(), x.max() + ni = max(2 * int((xmax - xmin) / hopt) + 3, 5) + print(ni) + print(xmin, xmax) + sml = hopt * 0.1 + xi = np.linspace(xmin - sml, xmax + sml, ni) + xiii = np.linspace(xmin - sml, xmax + sml, 4 * ni + 1) + + c = gridcount(x, xi) + if (y == 1).any(): + c0 = gridcount(x[y == 1], xi) + else: + c0 = np.zeros(np.shape(xi)) + yi = np.where(c == 0, 0, c0 / c) + + kreg = KRegression(x, y, hs=hs, p=0) + fiii = kreg(xiii) + yiii = interpolate.interp1d(xi, yi)(xiii) + fit = fun1(xiii).clip(max=1.0) + df = np.diff(fiii) + eerr = np.abs((yiii - fiii)).std() + 0.5 * (df[:-1] * df[1:] < 0).sum() / n + err = (fiii - fit).std() + msg = '{} err={1:1.3f},eerr={2:1.3f}, n={:d}, hs={:1.3f}, hs1={:1.3f}, '\ + 'hs2={:1.3f}' + title = (msg.format(fun, err, eerr, n, hs, hs1, hs2)) + f = kreg(xiii, output='plotobj', title=title, plotflag=1) + + # yi[yi==0] = 1.0/(c[c!=0].min()+4) + # yi[yi==1] = 1-1.0/(c[c!=0].min()+4) + # yi[yi==0] = fi[yi==0] + # yi[yi==0] = np.exp(stineman_interp(xi[yi==0], xi[yi>0],np.log(yi[yi>0]))) + # yi[yi==0] = 
fun1(xi[yi==0]) + try: + yi[yi == 0] = yi[yi > 0].min() / sqrt(n) + except: + yi[yi == 0] = 1. / n + yi[yi == 1] = 1 - (1 - yi[yi < 1].max()) / sqrt(n) + + logity = forward(yi) + + gkreg = KRegression(xi, logity, hs=hs, xmin=xmin - hopt, xmax=xmax + hopt) + fg = gkreg.eval_grid( + xi, output='plotobj', title='Kernel regression', plotflag=1) + sa = (fg.data - logity).std() + sa2 = iqrange(fg.data - logity) / 1.349 + # print('sa=%g %g' % (sa, sa2)) + sa = min(sa, sa2) + +# plt.figure(1) +# plt.plot(xi, slogity-logity,'r.') +# plt.plot(xi, logity-,'b.') +# plt.plot(xi, fg.data-logity, 'b.') +# plt.show() +# return + + fg = gkreg.eval_grid( + xiii, output='plotobj', title='Kernel regression', plotflag=1) + pi = reverse(fg.data) + + dx = xi[1] - xi[0] + ckreg = KDE(x, hs=hs) + # ci = ckreg.eval_grid_fast(xi)*n*dx + ciii = ckreg.eval_grid_fast(xiii) * dx * x.size # n*(1+symmetric) + +# sa1 = np.sqrt(1./(ciii*pi*(1-pi))) +# plo3 = reverse(fg.data-z0*sa) +# pup3 = reverse(fg.data+z0*sa) + fg.data = pi + pi = f.data + + # ref Casella and Berger (1990) "Statistical inference" pp444 +# a = 2*pi + z0**2/(ciii+1e-16) +# b = 2*(1+z0**2/(ciii+1e-16)) +# plo2 = ((a-sqrt(a**2-2*pi**2*b))/b).clip(min=0,max=1) +# pup2 = ((a+sqrt(a**2-2*pi**2*b))/b).clip(min=0,max=1) + # Jeffreys intervall a=b=0.5 + # st.beta.isf(alpha/2, x+a, n-x+b) + ab = 0.07 # 0.055 + pi1 = pi # fun1(xiii) + pup2 = np.where(pi == 1, + 1, + st.beta.isf(alpha / 2, + ciii * pi1 + ab, + ciii * (1 - pi1) + ab)) + plo2 = np.where(pi == 0, + 0, + st.beta.isf(1 - alpha / 2, + ciii * pi1 + ab, + ciii * (1 - pi1) + ab)) + + averr = np.trapz(pup2 - plo2, xiii) / \ + (xiii[-1] - xiii[0]) + 0.5 * (df[:-1] * df[1:] < 0).sum() + + # f2 = kreg_demo4(x, y, hs, hopt) + # Wilson score + den = 1 + (z0 ** 2. / ciii) + xc = (pi1 + (z0 ** 2) / (2 * ciii)) / den + halfwidth = (z0 * sqrt((pi1 * (1 - pi1) / ciii) + + (z0 ** 2 / (4 * (ciii ** 2))))) / den + plo = (xc - halfwidth).clip(min=0) # wilson score + pup = (xc + halfwidth).clip(max=1.0) # wilson score + # pup = (pi + z0*np.sqrt(pi*(1-pi)/ciii)).clip(min=0,max=1) # dont use + # plo = (pi - z0*np.sqrt(pi*(1-pi)/ciii)).clip(min=0,max=1) + + # mi = kreg.eval_grid(x) + # sigma = (stineman_interp(x, xiii, pup)-stineman_interp(x, xiii, plo))/4 + # aic = np.abs((y-mi)/sigma).std()+ 0.5*(df[:-1]*df[1:]<0).sum()/n + # aic = np.abs((yiii-fiii)/(pup-plo)).std() + \ + # 0.5*(df[:-1]*df[1:]<0).sum() + \ + # ((yiii-pup).clip(min=0)-(yiii-plo).clip(max=0)).sum() + + k = (df[:-1] * df[1:] < 0).sum() # numpeaks + sigmai = (pup - plo) + aic = (((yiii - fiii) / sigmai) ** 2).sum() + \ + 2 * k * (k + 1) / np.maximum(ni - k + 1, 1) + \ + np.abs((yiii - pup).clip(min=0) - (yiii - plo).clip(max=0)).sum() + + # aic = (((yiii-fiii)/sigmai)**2).sum()+ 2*k*(k+1)/(ni-k+1) + \ + # np.abs((yiii-pup).clip(min=0)-(yiii-plo).clip(max=0)).sum() + + # aic = averr + ((yiii-pup).clip(min=0)-(yiii-plo).clip(max=0)).sum() + + fg.plot(label='KReg grid aic={:2.3f}'.format(aic)) + f.plot(label='KReg averr={:2.3f} '.format(averr)) + labtxt = '%d CI' % (int(100 * (1 - alpha))) + plt.fill_between(xiii, pup, plo, alpha=0.20, + color='r', linestyle='--', label=labtxt) + plt.fill_between(xiii, pup2, plo2, alpha=0.20, color='b', linestyle=':', + label='{:d} CI2'.format(int(100 * (1 - alpha)))) + plt.plot(xiii, fun1(xiii), 'r', label='True model') + plt.scatter(xi, yi, label='data') + print('maxp = {:g}'.format(np.nanmax(f.data))) + print('hs = {:g}'.format(kreg.tkde.tkde.hs)) + plt.legend() + h = plt.gca() + if plotlog: + plt.setp(h, yscale='log') + # 
plt.show() + return hs1, hs2 + + +def kreg_demo4(x, y, hs, hopt, alpha=0.05): + st = scipy.stats + + n = x.size + xmin, xmax = x.min(), x.max() + ni = max(2 * int((xmax - xmin) / hopt) + 3, 5) + + sml = hopt * 0.1 + xi = np.linspace(xmin - sml, xmax + sml, ni) + xiii = np.linspace(xmin - sml, xmax + sml, 4 * ni + 1) + + kreg = KRegression(x, y, hs=hs, p=0) + + dx = xi[1] - xi[0] + ciii = kreg.tkde.eval_grid_fast(xiii) * dx * x.size +# ckreg = KDE(x,hs=hs) +# ciiii = ckreg.eval_grid_fast(xiii)*dx* x.size #n*(1+symmetric) + + f = kreg(xiii, output='plotobj') # , plot_kwds=dict(plotflag=7)) + pi = f.data + + # Jeffreys intervall a=b=0.5 + # st.beta.isf(alpha/2, x+a, n-x+b) + ab = 0.07 # 0.5 + pi1 = pi + pup = np.where(pi1 == 1, 1, st.beta.isf( + alpha / 2, ciii * pi1 + ab, ciii * (1 - pi1) + ab)) + plo = np.where(pi1 == 0, 0, st.beta.isf( + 1 - alpha / 2, ciii * pi1 + ab, ciii * (1 - pi1) + ab)) + + # Wilson score + # z0 = -_invnorm(alpha/2) +# den = 1+(z0**2./ciii); +# xc=(pi1+(z0**2)/(2*ciii))/den; +# halfwidth=(z0*sqrt((pi1*(1-pi1)/ciii)+(z0**2/(4*(ciii**2)))))/den +# plo2 = (xc-halfwidth).clip(min=0) # wilson score +# pup2 = (xc+halfwidth).clip(max=1.0) # wilson score + # f.dataCI = np.vstack((plo,pup)).T + f.prediction_error_avg = np.trapz(pup - plo, xiii) / (xiii[-1] - xiii[0]) + fiii = f.data + + c = gridcount(x, xi) + if (y == 1).any(): + c0 = gridcount(x[y == 1], xi) + else: + c0 = np.zeros(np.shape(xi)) + yi = np.where(c == 0, 0, c0 / c) + + f.children = [PlotData([plo, pup], xiii, plotmethod='fill_between', + plot_kwds=dict(alpha=0.2, color='r')), + PlotData(yi, xi, plotmethod='scatter', + plot_kwds=dict(color='r', s=5))] + + yiii = interpolate.interp1d(xi, yi)(xiii) + df = np.diff(fiii) + k = (df[:-1] * df[1:] < 0).sum() # numpeaks + sigmai = (pup - plo) + aicc = (((yiii - fiii) / sigmai) ** 2).sum() + \ + 2 * k * (k + 1) / np.maximum(ni - k + 1, 1) + \ + np.abs((yiii - pup).clip(min=0) - (yiii - plo).clip(max=0)).sum() + + f.aicc = aicc + f.labels.title = ('perr={:1.3f},aicc={:1.3f}, n={:d}, ' + 'hs={:1.3f}'.format(f.prediction_error_avg, aicc, n, hs)) + + return f + + +def check_kreg_demo3(): + + plt.ion() + k = 0 + for n in [50, 100, 300, 600, 4000]: + x, y, fun1 = _get_data( + n, symmetric=True, loc1=1.0, scale1=0.6, scale2=1.25) + k0 = k + + for fun in ['hste', ]: + hsmax, _hs1, _hs2 = _get_regression_smooting(x, y, fun=fun) + for hi in np.linspace(hsmax * 0.25, hsmax, 9): + plt.figure(k) + k += 1 + unused = kreg_demo3(x, y, fun1, hs=hi, fun=fun, plotlog=False) + + # kreg_demo2(n=n,symmetric=True,fun='hste', plotlog=False) + fig.tile(range(k0, k)) + plt.ioff() + plt.show() + + +def check_kreg_demo4(): + plt.ion() + # test_docstrings() + # kde_demo2() + # kreg_demo1(fast=True) + # kde_gauss_demo() + # kreg_demo2(n=120,symmetric=True,fun='hste', plotlog=True) + k = 0 + for _i, n in enumerate([100, 300, 600, 4000]): + x, y, fun1 = _get_data( + n, symmetric=True, loc1=0.1, scale1=0.6, scale2=0.75) + # k0 = k + hopt1, _h1, _h2 = _get_regression_smooting(x, y, fun='hos') + hopt2, _h1, _h2 = _get_regression_smooting(x, y, fun='hste') + hopt = sqrt(hopt1 * hopt2) + # hopt = _get_regression_smooting(x,y,fun='hos')[0] + for _j, fun in enumerate(['hste']): # , 'hisj', 'hns', 'hstt' + hsmax, _hs1, _hs2 = _get_regression_smooting(x, y, fun=fun) + + fmax = kreg_demo4(x, y, hsmax + 0.1, hopt) + for hi in np.linspace(hsmax * 0.1, hsmax, 55): + f = kreg_demo4(x, y, hi, hopt) + if f.aicc <= fmax.aicc: + fmax = f + plt.figure(k) + k += 1 + fmax.plot() + plt.plot(x, fun1(x), 'r') + + # 
kreg_demo2(n=n,symmetric=True,fun='hste', plotlog=False) + fig.tile(range(0, k)) + plt.ioff() + plt.show() + + +def check_regression_bin(): + plt.ion() + # test_docstrings() + # kde_demo2() + # kreg_demo1(fast=True) + # kde_gauss_demo() + # kreg_demo2(n=120,symmetric=True,fun='hste', plotlog=True) + k = 0 + for _i, n in enumerate([100, 300, 600, 4000]): + x, y, fun1 = _get_data( + n, symmetric=True, loc1=0.1, scale1=0.6, scale2=0.75) + fbest = regressionbin(x, y, alpha=0.05, color='g', label='Transit_D') + + figk = plt.figure(k) + ax = figk.gca() + k += 1 + fbest.labels.title = 'N = {:d}'.format(n) + fbest.plot(axis=ax) + ax.plot(x, fun1(x), 'r') + ax.legend(frameon=False, markerscale=4) + # ax = plt.gca() + ax.set_yticklabels(ax.get_yticks() * 100.0) + ax.grid(True) + + fig.tile(range(0, k)) + plt.ioff() + plt.show() + + +def check_bkregression(): + plt.ion() + k = 0 + for _i, n in enumerate([50, 100, 300, 600]): + x, y, fun1 = _get_data( + n, symmetric=True, loc1=0.1, scale1=0.6, scale2=0.75) + bkreg = BKRegression(x, y) + fbest = bkreg.prb_search_best( + hsfun='hste', alpha=0.05, color='g', label='Transit_D') + + figk = plt.figure(k) + ax = figk.gca() + k += 1 +# fbest.score.plot(axis=ax) +# axsize = ax.axis() +# ax.vlines(fbest.hs,axsize[2]+1,axsize[3]) +# ax.set(yscale='log') + fbest.labels.title = 'N = {:d}'.format(n) + fbest.plot(axis=ax) + ax.plot(x, fun1(x), 'r') + ax.legend(frameon=False, markerscale=4) + # ax = plt.gca() + ax.set_yticklabels(ax.get_yticks() * 100.0) + ax.grid(True) + + fig.tile(range(0, k)) + plt.ioff() + plt.show() + + +def _get_regression_smooting(x, y, fun='hste'): + hs1 = Kernel('gauss', fun=fun).get_smoothing(x) + # hx = np.median(np.abs(x-np.median(x)))/0.6745*(4.0/(3*n))**0.2 + if (y == 1).any(): + hs2 = Kernel('gauss', fun=fun).get_smoothing(x[y == 1]) + # hy = np.median(np.abs(y-np.mean(y)))/0.6745*(4.0/(3*n))**0.2 + else: + hs2 = 4 * hs1 + # hy = 4*hx + + # hy2 = Kernel('gauss', fun=fun).get_smoothing(y) + # kernel = Kernel('gauss',fun=fun) + # hopt = (hs1+2*hs2)/3 + # hopt = (hs1+4*hs2)/5 #kernel.get_smoothing(x) + # hopt = hs2 + hopt = sqrt(hs1 * hs2) + return hopt, hs1, hs2 + + +def empirical_bin_prb(x, y, hopt, color='r'): + """Returns empirical binomial probabiltity. 
+ + Parameters + ---------- + x : ndarray + position ve + y : ndarray + binomial response variable (zeros and ones) + + Returns + ------- + P(x) : PlotData object + empirical probability + + """ + xmin, xmax = x.min(), x.max() + ni = max(2 * int((xmax - xmin) / hopt) + 3, 5) + + sml = hopt # *0.1 + xi = np.linspace(xmin - sml, xmax + sml, ni) + + c = gridcount(x, xi) + if (y == 1).any(): + c0 = gridcount(x[y == 1], xi) + else: + c0 = np.zeros(np.shape(xi)) + yi = np.where(c == 0, 0, c0 / c) + return PlotData(yi, xi, plotmethod='scatter', + plot_kwds=dict(color=color, s=5)) + + +def smoothed_bin_prb(x, y, hs, hopt, alpha=0.05, color='r', label='', + bin_prb=None): + ''' + Parameters + ---------- + x,y + hs : smoothing parameter + hopt : spacing in empirical_bin_prb + alpha : confidence level + color : color of plot object + bin_prb : PlotData object with empirical bin prb + ''' + if bin_prb is None: + bin_prb = empirical_bin_prb(x, y, hopt, color) + + xi = bin_prb.args + yi = bin_prb.data + ni = len(xi) + dxi = xi[1] - xi[0] + + n = x.size + + xiii = np.linspace(xi[0], xi[-1], 10 * ni + 1) + + kreg = KRegression(x, y, hs=hs, p=0) + # expected number of data in each bin + ciii = kreg.tkde.eval_grid_fast(xiii) * dxi * n + + f = kreg(xiii, output='plotobj') # , plot_kwds=dict(plotflag=7)) + pi = f.data + + st = scipy.stats + # Jeffreys intervall a=b=0.5 + # st.beta.isf(alpha/2, x+a, n-x+b) + ab = 0.07 # 0.5 + pi1 = pi + pup = np.where(pi1 == 1, 1, st.beta.isf( + alpha / 2, ciii * pi1 + ab, ciii * (1 - pi1) + ab)) + plo = np.where(pi1 == 0, 0, st.beta.isf( + 1 - alpha / 2, ciii * pi1 + ab, ciii * (1 - pi1) + ab)) + + # Wilson score + # z0 = -_invnorm(alpha/2) +# den = 1+(z0**2./ciii); +# xc=(pi1+(z0**2)/(2*ciii))/den; +# halfwidth=(z0*sqrt((pi1*(1-pi1)/ciii)+(z0**2/(4*(ciii**2)))))/den +# plo2 = (xc-halfwidth).clip(min=0) # wilson score +# pup2 = (xc+halfwidth).clip(max=1.0) # wilson score + # f.dataCI = np.vstack((plo,pup)).T + f.prediction_error_avg = np.trapz(pup - plo, xiii) / (xiii[-1] - xiii[0]) + fiii = f.data + + f.plot_kwds['color'] = color + f.plot_kwds['linewidth'] = 2 + if label: + f.plot_kwds['label'] = label + f.children = [PlotData([plo, pup], xiii, plotmethod='fill_between', + plot_kwds=dict(alpha=0.2, color=color)), + bin_prb] + + yiii = interpolate.interp1d(xi, yi)(xiii) + df = np.diff(fiii) + k = (df[:-1] * df[1:] < 0).sum() # numpeaks + sigmai = (pup - plo) + aicc = (((yiii - fiii) / sigmai) ** 2).sum() + \ + 2 * k * (k + 1) / np.maximum(ni - k + 1, 1) + \ + np.abs((yiii - pup).clip(min=0) - (yiii - plo).clip(max=0)).sum() + + f.aicc = aicc + f.fun = kreg + f.labels.title = ('perr={:1.3f},aicc={:1.3f}, n={:d}, ' + 'hs={:1.3f}'.format(f.prediction_error_avg, aicc, n, hs)) + + return f + + +def regressionbin(x, y, alpha=0.05, color='r', label=''): + """Return kernel regression estimate for binomial data. 
+ + Parameters + ---------- + x : arraylike + positions + y : arraylike + of 0 and 1 + + """ + + hopt1, _h1, _h2 = _get_regression_smooting(x, y, fun='hos') + hopt2, _h1, _h2 = _get_regression_smooting(x, y, fun='hste') + hopt = sqrt(hopt1 * hopt2) + + fbest = smoothed_bin_prb(x, y, hopt2 + 0.1, hopt, alpha, color, label) + bin_prb = fbest.children[-1] + for fun in ['hste']: # , 'hisj', 'hns', 'hstt' + hsmax, _hs1, _hs2 = _get_regression_smooting(x, y, fun=fun) + for hi in np.linspace(hsmax * 0.1, hsmax, 55): + f = smoothed_bin_prb(x, y, hi, hopt, alpha, color, label, bin_prb) + if f.aicc <= fbest.aicc: + fbest = f + # hbest = hi + return fbest + + +def kde_gauss_demo(n=50): + """KDEDEMO Demonstrate the KDEgauss. + + KDEDEMO1 shows the true density (dotted) compared to KDE based on 7 + observations (solid) and their individual kernels (dashed) for 3 + different values of the smoothing parameter, hs. + + """ + + st = scipy.stats + # x = np.linspace(-4, 4, 101) + # data = np.random.normal(loc=0, scale=1.0, size=n) + # data = np.random.exponential(scale=1.0, size=n) +# n1 = 128 +# I = (np.arange(n1)*pi)**2 *0.01*0.5 +# kw = exp(-I) +# plt.plot(idctn(kw)) +# return + dist = st.norm + # dist = st.expon + data = dist.rvs(loc=0, scale=1.0, size=n) + d, _N = np.atleast_2d(data).shape + + if d == 1: + plot_options = [dict(color='red', label='KDE hste'), + dict(color='green', label='TKDE hisj'), + dict(color='black', label='KDEgauss hste')] + else: + plot_options = [dict(colors='red'), dict(colors='green'), + dict(colors='black')] + + plt.figure(1) + t0 = time.time() + kde0 = KDE(data, kernel=Kernel('gauss', 'hste')) + f0 = kde0.eval_grid_fast(output='plot', ylab='Density', r=0) + t1 = time.time() + total1 = t1-t0 + + f0.plot('.', **plot_options[0]) + if dist.name != 'norm': + kde1 = TKDE(data, kernel=Kernel('gauss', 'hisj'), L2=.5) + f1 = kde1.eval_grid_fast(output='plot', ylab='Density', r=0) + f1.plot(**plot_options[1]) + else: + kde1 = kde0 + f1 = f0 + t1 = time.time() + kde2 = KDEgauss(data) + f2 = kde2(output='plot', ylab='Density', r=0) + t2 = time.time() + total2 = t2-t1 + + x = f2.args + f2.plot(**plot_options[2]) + + fmax = dist.pdf(x, 0, 1).max() + if d == 1: + plt.plot(x, dist.pdf(x, 0, 1), 'k:', label='True pdf') + plt.axis([x.min(), x.max(), 0, fmax]) + plt.legend() + plt.show() + print(fmax / f2.data.max()) + try: + print('hs0={:s} hs1={:s} hs2={:s}'.format(str(kde0.hs.tolist()), + str(kde1.tkde.hs.tolist()), + str(kde2.hs.tolist()))) + except: + pass + print('inc0 = {:d}, inc1 = {:d}, inc2 = {:d}'.format(kde0.inc, kde1.inc, + kde2.inc)) + print(np.trapz(f0.data, f0.args), np.trapz(f2.data, f2.args)) + print(total1, total2) + + +def test_kde(): + data = np.array([ + 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887, + 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, + 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, + 1.8919469, 0.72433808, 1.92973094, 0.44749838, 1.36508452]) + + x = np.linspace(0.01, max(data + 1), 10) + kde = TKDE(data, hs=0.5, L2=0.5) + _f = kde(x) + # f = array([1.03982714, 0.45839018, 0.39514782, 0.32860602, 0.26433318, + # 0.20717946, 0.15907684, 0.1201074 , 0.08941027, 0.06574882]) + + _f1 = kde.eval_grid(x) + # array([ 1.03982714, 0.45839018, 0.39514782, 0.32860602, 0.26433318, + # 0.20717946, 0.15907684, 0.1201074 , 0.08941027, 0.06574882]) + + _f2 = kde.eval_grid_fast(x) + # array([ 1.06437223, 0.46203314, 0.39593137, 0.32781899, 0.26276433, + # 0.20532206, 0.15723498, 0.11843998, 0.08797755, 0. 
]) + + +if __name__ == '__main__': + if True: + test_docstrings(__file__) + else: + # test_kde() + # check_bkregression() + # check_regression_bin() + # check_kreg_demo3() + # check_kreg_demo4() + + # kde_demo2() + # kreg_demo1(fast=True) + kde_gauss_demo(n=50) + # kreg_demo2(n=120,symmetric=True,fun='hste', plotlog=True) + plt.show('hold') diff --git a/wafo/kdetools/kernels.py b/wafo/kdetools/kernels.py new file mode 100644 index 0000000..94e013c --- /dev/null +++ b/wafo/kdetools/kernels.py @@ -0,0 +1,1382 @@ +''' +Created on 15. des. 2016 + +@author: pab +''' +from __future__ import division +from abc import ABCMeta, abstractmethod +import warnings +import numpy as np +from numpy import pi, sqrt, exp, percentile +from scipy import optimize, linalg +from scipy.special import gamma +from wafo.misc import tranproc # , trangood +from wafo.kdetools.gridding import gridcount +from wafo.dctpack import dct +from wafo.testing import test_docstrings + +__all__ = ['Kernel', 'sphere_volume', 'qlevels', 'iqrange', 'percentile'] + + +def _assert(cond, msg): + if not cond: + raise ValueError(msg) + +# stats = (mu2, R, Rdd) where +# mu2 : 2'nd order moment, i.e.,int(x^2*kernel(x)) +# R : integral of squared kernel, i.e., int(kernel(x)^2) +# Rdd : int( (kernel''(x))^2 ). +_stats_epan = (1. / 5, 3. / 5, np.inf) +_stats_biwe = (1. / 7, 5. / 7, 45. / 2) +_stats_triw = (1. / 9, 350. / 429, np.inf) +_stats_rect = (1. / 3, 1. / 2, np.inf) +_stats_tria = (1. / 6, 2. / 3, np.inf) +_stats_lapl = (2, 1. / 4, np.inf) +_stats_logi = (pi ** 2 / 3, 1. / 6, 1 / 42) +_stats_gaus = (1, 1. / (2 * sqrt(pi)), 3. / (8 * sqrt(pi))) + + +def qlevels(pdf, p=(10, 30, 50, 70, 90, 95, 99, 99.9), x1=None, x2=None): + """QLEVELS Calculates quantile levels which encloses P% of PDF. + + CALL: [ql PL] = qlevels(pdf,PL,x1,x2); + + ql = the discrete quantile levels. + pdf = joint point density function matrix or vector + PL = percent level (default [10:20:90 95 99 99.9]) + x1,x2 = vectors of the spacing of the variables + (Default unit spacing) + + QLEVELS numerically integrates PDF by decreasing height and find the + quantile levels which encloses P% of the distribution. If X1 and + (or) X2 is unspecified it is assumed that dX1 and dX2 is constant. + NB! QLEVELS normalizes the integral of PDF to N/(N+0.001) before + calculating QL in order to reflect the sampling of PDF is finite. + Currently only able to handle 1D and 2D PDF's if dXi is not constant + (i=1,2). 
+ + Example + ------- + >>> import wafo.stats as ws + >>> x = np.linspace(-8,8,2001); + >>> PL = np.r_[10:90:20, 90, 95, 99, 99.9] + >>> qlevels(ws.norm.pdf(x),p=PL, x1=x); + array([ 0.39591707, 0.37058719, 0.31830968, 0.23402133, 0.10362052, + 0.05862129, 0.01449505, 0.00178806]) + + # compared with the exact values + >>> ws.norm.pdf(ws.norm.ppf((100-PL)/200)) + array([ 0.39580488, 0.370399 , 0.31777657, 0.23315878, 0.10313564, + 0.05844507, 0.01445974, 0.00177719]) + + See also + -------- + qlevels2, tranproc + + """ + + norm = 1 # normalize cdf to unity + pdf = np.atleast_1d(pdf) + _assert(not any(pdf.ravel() < 0), 'This is not a pdf since one or more ' + 'values of pdf is negative') + + fsiz = pdf.shape + fsizmin = min(fsiz) + if fsizmin == 0: + return [] + + N = np.prod(fsiz) + d = len(fsiz) + if x1 is None or ((x2 is None) and d > 2): + fdfi = pdf.ravel() + else: + if d == 1: # pdf in one dimension + dx22 = np.ones(1) + else: # % pdf in two dimensions + dx2 = np.diff(x2.ravel()) * 0.5 + dx22 = np.r_[0, dx2] + np.r_[dx2, 0] + + dx1 = np.diff(x1.ravel()) * 0.5 + dx11 = np.r_[0, dx1] + np.r_[dx1, 0] + dx1x2 = dx22[:, None] * dx11 + fdfi = (pdf * dx1x2).ravel() + + p = np.atleast_1d(p) + _assert(not np.any((p < 0) | (100 < p)), 'PL must satisfy 0 <= PL <= 100') + + p2 = p / 100.0 + ind = np.argsort(pdf.ravel()) # sort by height of pdf + ind = ind[::-1] + fi = pdf.flat[ind] + + # integration in the order of decreasing height of pdf + Fi = np.cumsum(fdfi[ind]) + + if norm: # normalize Fi to make sure int pdf dx1 dx2 approx 1 + Fi = Fi / Fi[-1] * N / (N + 1.5e-8) + + maxFi = np.max(Fi) + if maxFi > 1: + warnings.warn('this is not a pdf since cdf>1! normalizing') + + Fi = Fi / Fi[-1] * N / (N + 1.5e-8) + + elif maxFi < .95: + msg = '''The given pdf is too sparsely sampled since cdf<.95. + Thus QL is questionable''' + warnings.warn(msg) + + # make sure Fi is strictly increasing by not considering duplicate values + ind, = np.where(np.diff(np.r_[Fi, 1]) > 0) + # calculating the inverse of Fi to find the index + ui = tranproc(Fi[ind], fi[ind], p2) + + if np.any(ui >= max(pdf.ravel())): + warnings.warn('The lowest percent level is too close to 0%') + + if np.any(ui <= min(pdf.ravel())): + msg = '''The given pdf is too sparsely sampled or + the highest percent level is too close to 100%''' + warnings.warn(msg) + ui[ui < 0] = 0.0 + + return ui + + +def qlevels2(data, p=(10, 30, 50, 70, 90, 95, 99, 99.9), method=1): + """QLEVELS2 Calculates quantile levels which encloses P% of data. + + CALL: [ql PL] = qlevels2(data,PL,method); + + ql = the discrete quantile levels, size D X Np + Parameters + ---------- + data : data matrix, size D x N (D = # of dimensions) + p : percent level vector, length Np (default [10:20:90 95 99 99.9]) + method : integer + 1 Interpolation so that F(X_[k]) == k/(n-1). (linear default) + 2 Interpolation so that F(X_[k]) == (k+0.5)/n. (midpoint) + 3 Interpolation so that F(X_[k]) == (k+1)/n. (lower) + 4 Interpolation so that F(X_[k]) == k/n. (higher) + + Returns + ------- + + QLEVELS2 sort the columns of data in ascending order and find the + quantile levels for each column which encloses P% of the data. + + Examples : Finding quantile levels enclosing P% of data: + -------- + >>> import wafo.stats as ws + >>> PL = np.r_[10:90:20, 90, 95, 99, 99.9] + >>> xs = ws.norm.rvs(size=2500000) + >>> np.allclose(qlevels2(ws.norm.pdf(xs), p=PL), + ... [0.3958, 0.3704, 0.3179, 0.2331, 0.1031, 0.05841, 0.01451, 0.001751], + ... 
rtol=1e-1) + True + + # compared with the exact values + >>> ws.norm.pdf(ws.norm.ppf((100-PL)/200)) + array([ 0.39580488, 0.370399 , 0.31777657, 0.23315878, 0.10313564, + 0.05844507, 0.01445974, 0.00177719]) + + # Finding the median of xs: + >>> '%2.2f' % np.abs(qlevels2(xs,50)[0]) + '0.00' + + See also + -------- + qlevels + + """ + _assert(0 < method < 5, + 'Method must be between 1 to 4. Got method={}.'.format(method)) + interpolation = ['', 'linear', 'midpoint', 'lower', 'higher'][method] + q = 100 - np.atleast_1d(p) + return percentile(data, q, axis=-1, interpolation=interpolation) + + +def iqrange(data, axis=None): + """Returns the Inter Quartile Range of data. + + Parameters + ---------- + data : array-like + Input array or object that can be converted to an array. + axis : {None, int}, optional + Axis along which the percentiles are computed. The default (axis=None) + is to compute the median along a flattened version of the array. + + Returns + ------- + r : array-like + abs(np.percentile(data, 75, axis)-np.percentile(data, 25, axis)) + + Notes + ----- + IQRANGE is a robust measure of spread. The use of interquartile range + guards against outliers if the distribution have heavy tails. + + Example + ------- + >>> a = np.arange(101) + >>> iqrange(a) + 50.0 + + See also + -------- + np.std + + """ + return np.abs(np.percentile(data, 75, axis=axis) - + np.percentile(data, 25, axis=axis)) + + +def sphere_volume(d, r=1.0): + """ + Returns volume of d-dimensional sphere with radius r + + Parameters + ---------- + d : scalar or array_like + dimension of sphere + r : scalar or array_like + radius of sphere (default 1) + + Example + ------- + >>> sphere_volume(2., r=2.) + 12.566370614359172 + >>> sphere_volume(2., r=1.) + 3.1415926535897931 + + Reference + --------- + Wand,M.P. and Jones, M.C. (1995) + 'Kernel smoothing' + Chapman and Hall, pp 105 + """ + return (r ** d) * 2.0 * pi ** (d / 2.0) / (d * gamma(d / 2.0)) + + +class _Kernel(object): + __metaclass__ = ABCMeta + + def __init__(self, r=1.0, stats=None): + self.r = r # radius of kernel + self.stats = stats + + def norm_factor(self, d=1, n=None): + _assert(0 < d, "D") + _assert(0 < n, "Number of samples too few (n={})".format(n)) + return 1.0 + + @abstractmethod + def _kernel(self, x): + pass + + def norm_kernel(self, x): + X = np.atleast_2d(x) + return self._kernel(X) / self.norm_factor(*X.shape) + + def kernel(self, x): + return self._kernel(np.atleast_2d(x)) + + def deriv4_6_8_10(self, t, numout=4): + raise NotImplementedError('Method not implemented for this kernel!') + + def effective_support(self): + """Return the effective support of kernel. + + The kernel must be symmetric and compactly supported on [-tau tau] + if the kernel has infinite support then the kernel must have the + effective support in [-tau tau], i.e., be negligible outside the range + + """ + return self._effective_support() + + def _effective_support(self): + return -self.r, self.r + __call__ = kernel + + +class _KernelMulti(_Kernel): + """ + p=0; Sphere = rect for 1D + p=1; Multivariate Epanechnikov kernel. 
+ p=2; Multivariate Bi-weight Kernel + p=3; Multi variate Tri-weight Kernel + p=4; Multi variate Four-weight Kernel + """ + def __init__(self, r=1.0, p=1, stats=None): + self.p = p + super(_KernelMulti, self).__init__(r, stats) + + def norm_factor(self, d=1, n=None): + r = self.r + p = self.p + c = 2 ** p * np.prod(np.r_[1:p + 1]) * sphere_volume(d, r) / np.prod( + np.r_[(d + 2):(2 * p + d + 1):2]) # normalizing constant + return c + + def _kernel(self, x): + r = self.r + p = self.p + x2 = x ** 2 + return ((1.0 - x2.sum(axis=0) / r ** 2).clip(min=0.0)) ** p + +mkernel_epanechnikov = _KernelMulti(p=1, stats=_stats_epan) +mkernel_biweight = _KernelMulti(p=2, stats=_stats_biwe) +mkernel_triweight = _KernelMulti(p=3, stats=_stats_triw) + + +class _KernelProduct(_KernelMulti): + """ + p=0; rectangular + p=1; 1D product Epanechnikov kernel. + p=2; 1D product Bi-weight Kernel + p=3; 1D product Tri-weight Kernel + p=4; 1D product Four-weight Kernel + """ + def norm_factor(self, d=1, n=None): + r = self.r + p = self.p + c = (2 ** p * np.prod(np.r_[1:p + 1]) * sphere_volume(1, r) / + np.prod(np.r_[(1 + 2):(2 * p + 2):2])) + return c ** d + + def _kernel(self, x): + r = self.r # radius + pdf = (1 - (x / r) ** 2).clip(min=0.0) ** self.p + return pdf.prod(axis=0) + +mkernel_p1epanechnikov = _KernelProduct(p=1, stats=_stats_epan) +mkernel_p1biweight = _KernelProduct(p=2, stats=_stats_biwe) +mkernel_p1triweight = _KernelProduct(p=3, stats=_stats_triw) + + +class _KernelRectangular(_Kernel): + + def _kernel(self, x): + return np.where(np.all(np.abs(x) <= self.r, axis=0), 1, 0.0) + + def norm_factor(self, d=1, n=None): + r = self.r + return (2 * r) ** d +mkernel_rectangular = _KernelRectangular(stats=_stats_rect) + + +class _KernelTriangular(_Kernel): + + def _kernel(self, x): + pdf = (1 - np.abs(x)).clip(min=0.0) + return pdf.prod(axis=0) +mkernel_triangular = _KernelTriangular(stats=_stats_tria) + + +class _KernelGaussian(_Kernel): + + def _kernel(self, x): + sigma = self.r / 4.0 + x2 = (x / sigma) ** 2 + return exp(-0.5 * x2.sum(axis=0)) + + def norm_factor(self, d=1, n=None): + sigma = self.r / 4.0 + return (2 * pi * sigma) ** (d / 2.0) + + def deriv4_6_8_10(self, t, numout=4): + """Returns 4th, 6th, 8th and 10th derivatives of the kernel + function.""" + phi0 = exp(-0.5 * t ** 2) / sqrt(2 * pi) + p4 = [1, 0, -6, 0, +3] + p4val = np.polyval(p4, t) * phi0 + if numout == 1: + return p4val + out = [p4val] + pn = p4 + for _i in range(numout - 1): + pnp1 = np.polyadd(-np.r_[pn, 0], np.polyder(pn)) + pnp2 = np.polyadd(-np.r_[pnp1, 0], np.polyder(pnp1)) + out.append(np.polyval(pnp2, t) * phi0) + pn = pnp2 + return out + +mkernel_gaussian = _KernelGaussian(r=4.0, stats=_stats_gaus) + +# def mkernel_gaussian(X): +# x2 = X ** 2 +# d = X.shape[0] +# return (2 * pi) ** (-d / 2) * exp(-0.5 * x2.sum(axis=0)) + + +class _KernelLaplace(_Kernel): + + def _kernel(self, x): + absX = np.abs(x) + return exp(-absX.sum(axis=0)) + + def norm_factor(self, d=1, n=None): + return 2 ** d +mkernel_laplace = _KernelLaplace(r=7.0, stats=_stats_lapl) + + +class _KernelLogistic(_Kernel): + + def _kernel(self, x): + s = exp(x) + return np.prod(s / (s + 1) ** 2, axis=0) +mkernel_logistic = _KernelLogistic(r=7.0, stats=_stats_logi) + +_MKERNEL_DICT = dict( + epan=mkernel_epanechnikov, + biwe=mkernel_biweight, + triw=mkernel_triweight, + p1ep=mkernel_p1epanechnikov, + p1bi=mkernel_p1biweight, + p1tr=mkernel_p1triweight, + rect=mkernel_rectangular, + tria=mkernel_triangular, + lapl=mkernel_laplace, + logi=mkernel_logistic, + 
gaus=mkernel_gaussian +) +_KERNEL_EXPONENT_DICT = dict( + re=0, sp=0, ep=1, bi=2, tr=3, fo=4, fi=5, si=6, se=7) + + +class Kernel(object): + + """Multivariate kernel. + + Parameters + ---------- + name : string + defining the kernel. Valid options are: + 'epanechnikov' - Epanechnikov kernel. + 'biweight' - Bi-weight kernel. + 'triweight' - Tri-weight kernel. + 'p1epanechnikov' - product of 1D Epanechnikov kernel. + 'p1biweight' - product of 1D Bi-weight kernel. + 'p1triweight' - product of 1D Tri-weight kernel. + 'triangular' - Triangular kernel. + 'gaussian' - Gaussian kernel + 'rectangular' - Rectangular kernel. + 'laplace' - Laplace kernel. + 'logistic' - Logistic kernel. + Note that only the first 4 letters of the kernel name is needed. + + Examples + -------- + N = 20 + data = np.random.rayleigh(1, size=(N,)) + >>> data = np.array([ + ... 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887, + ... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, + ... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, + ... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452]) + + >>> import wafo.kdetools as wk + >>> gauss = wk.Kernel('gaussian') + >>> gauss.stats() + (1, 0.28209479177387814, 0.21157109383040862) + >>> np.allclose(gauss.hscv(data), 0.21779575) + True + >>> np.allclose(gauss.hstt(data), 0.16341135) + True + >>> np.allclose(gauss.hste(data), 0.19179399) + True + >>> np.allclose(gauss.hldpi(data), 0.22502733) + True + >>> wk.Kernel('laplace').stats() + (2, 0.25, inf) + + >>> triweight = wk.Kernel('triweight') + >>> np.allclose(triweight.stats(), + ... (0.1111111111111111, 0.81585081585081587, np.inf)) + True + >>> np.allclose(triweight(np.linspace(-1,1,11)), + ... [ 0., 0.046656, 0.262144, 0.592704, 0.884736, 1., + ... 0.884736, 0.592704, 0.262144, 0.046656, 0.]) + True + >>> np.allclose(triweight.hns(data), 0.82, rtol=1e-2) + True + >>> np.allclose(triweight.hos(data), 0.88, rtol=1e-2) + True + >>> np.allclose(triweight.hste(data), 0.57, rtol=1e-2) + True + >>> np.allclose(triweight.hscv(data), 0.648, rtol=1e-2) + True + + See also + -------- + mkernel + + References + ---------- + B. W. Silverman (1986) + 'Density estimation for statistics and data analysis' + Chapman and Hall, pp. 43, 76 + + Wand, M. P. and Jones, M. C. (1995) + 'Density estimation for statistics and data analysis' + Chapman and Hall, pp 31, 103, 175 + + """ + + def __init__(self, name, fun='hste'): # 'hns'): + self.kernel = _MKERNEL_DICT[name[:4]] + self.get_smoothing = getattr(self, fun) + + @property + def name(self): + return self.kernel.__class__.__name__.replace('_Kernel', '').title() + + def stats(self): + """Return some 1D statistics of the kernel. + + Returns + ------- + mu2 : real scalar + 2'nd order moment, i.e.,int(x^2*kernel(x)) + R : real scalar + integral of squared kernel, i.e., int(kernel(x)^2) + Rdd : real scalar + integral of squared double derivative of kernel, + i.e., int( (kernel''(x))^2 ). + + Reference + --------- + Wand,M.P. and Jones, M.C. (1995) + 'Kernel smoothing' + Chapman and Hall, pp 176. + + """ + return self.kernel.stats + + def deriv4_6_8_10(self, t, numout=4): + return self.kernel.deriv4_6_8_10(t, numout) + + def effective_support(self): + return self.kernel.effective_support() + + def hns(self, data): + """Returns Normal Scale Estimate of Smoothing Parameter. 
+ + Parameter + --------- + data : 2D array + shape d x n (d = # dimensions ) + + Returns + ------- + h : array-like + one dimensional optimal value for smoothing parameter + given the data and kernel. size D + + HNS only gives an optimal value with respect to mean integrated + square error, when the true underlying distribution + is Gaussian. This works reasonably well if the data resembles a + Gaussian distribution. However if the distribution is asymmetric, + multimodal or have long tails then HNS may return a to large + smoothing parameter, i.e., the KDE may be oversmoothed and mask + important features of the data. (=> large bias). + One way to remedy this is to reduce H by multiplying with a constant + factor, e.g., 0.85. Another is to try different values for H and make a + visual check by eye. + + Example: + data = rndnorm(0, 1,20,1) + h = hns(data,'epan') + + See also: + --------- + hste, hbcv, hboot, hos, hldpi, hlscv, hscv, hstt, kde + + Reference: + --------- + B. W. Silverman (1986) + 'Density estimation for statistics and data analysis' + Chapman and Hall, pp 43-48 + Wand,M.P. and Jones, M.C. (1995) + 'Kernel smoothing' + Chapman and Hall, pp 60--63 + + """ + + a = np.atleast_2d(data) + n = a.shape[1] + + # R= int(mkernel(x)^2), mu2= int(x^2*mkernel(x)) + mu2, R, _Rdd = self.stats() + amise_constant = (8 * sqrt(pi) * R / (3 * mu2 ** 2 * n)) ** (1. / 5) + iqr = iqrange(a, axis=1) # interquartile range + stdA = np.std(a, axis=1, ddof=1) + # use of interquartile range guards against outliers. + # the use of interquartile range is better if + # the distribution is skew or have heavy tails + # This lessen the chance of oversmoothing. + return np.where(iqr > 0, + np.minimum(stdA, iqr / 1.349), stdA) * amise_constant + + def hos(self, data): + """Returns Oversmoothing Parameter. + + Parameter + --------- + data = data matrix, size N x D (D = # dimensions ) + + Returns + ------- + h : vector size 1 x D + one dimensional maximum smoothing value for smoothing parameter + given the data and kernel. + + The oversmoothing or maximal smoothing principle relies on the fact + that there is a simple upper bound for the AMISE-optimal bandwidth for + estimation of densities with a fixed value of a particular scale + measure. While HOS will give too large bandwidth for optimal estimation + of a general density it provides an excellent starting point for + subjective choice of bandwidth. A sensible strategy is to plot an + estimate with bandwidth HOS and then sucessively look at plots based on + convenient fractions of HOS to see what features are present in the + data for various amount of smoothing. The relation to HNS is given by: + + HOS = HNS/0.93 + + Example: + -------- + data = rndnorm(0, 1,20,1) + h = hos(data,'epan'); + + See also hste, hbcv, hboot, hldpi, hlscv, hscv, hstt, kde, kdefun + + Reference + --------- + B. W. Silverman (1986) + 'Density estimation for statistics and data analysis' + Chapman and Hall, pp 43-48 + + Wand,M.P. and Jones, M.C. (1986) + 'Kernel smoothing' + Chapman and Hall, pp 60--63 + + """ + return self.hns(data) / 0.93 + + def _hmns_scale(self, d): + name = self.name[:4].lower() + if name == 'epan': # Epanechnikov kernel + a = (8.0 * (d + 4.0) * (2 * sqrt(pi)) ** d / + sphere_volume(d)) ** (1. 
/ (4.0 + d)) + elif name == 'biwe': # Bi-weight kernel + a = 2.7779 + if d > 2: + raise NotImplementedError('Not implemented for d>2') + elif name == 'triw': # Triweight + a = 3.12 + if d > 2: + raise NotImplementedError('not implemented for d>2') + elif name == 'gaus': # Gaussian kernel + a = (4.0 / (d + 2.0)) ** (1. / (d + 4.0)) + else: + raise ValueError('Unknown kernel.') + return a + + def hmns(self, data): + """Returns Multivariate Normal Scale Estimate of Smoothing Parameter. + + CALL: h = hmns(data,kernel) + + h = M dimensional optimal value for smoothing parameter + given the data and kernel. size D x D + data = data matrix, size D x N (D = # dimensions ) + kernel = 'epanechnikov' - Epanechnikov kernel. + 'biweight' - Bi-weight kernel. + 'triweight' - Tri-weight kernel. + 'gaussian' - Gaussian kernel + + Note that only the first 4 letters of the kernel name is needed. + + HMNS only gives a optimal value with respect to mean integrated + square error, when the true underlying distribution is Multivariate + Gaussian. This works reasonably well if the data resembles a + Multivariate Gaussian distribution. However if the distribution is + asymmetric, multimodal or have long tails then HNS is maybe more + appropriate. + + Example: + data = rndnorm(0, 1,20,2) + h = hmns(data,'epan') + + See also + -------- + + hns, hste, hbcv, hboot, hos, hldpi, hlscv, hscv, hstt + + Reference + ---------- + B. W. Silverman (1986) + 'Density estimation for statistics and data analysis' + Chapman and Hall, pp 43-48, 87 + + Wand,M.P. and Jones, M.C. (1995) + 'Kernel smoothing' + Chapman and Hall, pp 60--63, 86--88 + + """ + # TODO: implement more kernels + + a = np.atleast_2d(data) + d, n = a.shape + if d == 1: + return self.hns(data) + scale = self._hmns_scale(d) + cov_a = np.cov(a) + return scale * linalg.sqrtm(cov_a).real * n ** (-1. / (d + 4)) + + def hste(self, data, h0=None, inc=128, maxit=100, releps=0.01, abseps=0.0): + '''HSTE 2-Stage Solve the Equation estimate of smoothing parameter. + + CALL: hs = hste(data,kernel,h0) + + hs = one dimensional value for smoothing parameter + given the data and kernel. size 1 x D + data = data matrix, size N x D (D = # dimensions ) + kernel = 'gaussian' - Gaussian kernel (default) + ( currently the only supported kernel) + h0 = initial starting guess for hs (default h0=hns(A,kernel)) + + Example: + x = rndnorm(0,1,50,1); + hs = hste(x,'gauss'); + + See also hbcv, hboot, hos, hldpi, hlscv, hscv, hstt, kde, kdefun + + Reference + --------- + B. W. Silverman (1986) + 'Density estimation for statistics and data analysis' + Chapman and Hall, pp 57--61 + + Wand,M.P. and Jones, M.C. (1986) + 'Kernel smoothing' + Chapman and Hall, pp 74--75 + ''' + # TODO: NB: this routine can be made faster: + # TODO: replace the iteration in the end with a Newton Raphson scheme + + A = np.atleast_2d(data) + d, n = A.shape + + # R = int(mkernel(x)^2), mu2 = int(x^2*mkernel(x)) + mu2, R, _Rdd = self.stats() + + amise_constant = (8 * sqrt(pi) * R / (3 * mu2 ** 2 * n)) ** (1. / 5) + ste_constant = R / (mu2 ** (2) * n) + + sigmaA = self.hns(A) / amise_constant + if h0 is None: + h0 = sigmaA * amise_constant + + h = np.asarray(h0, dtype=float) + + nfft = inc * 2 + amin = A.min(axis=1) # Find the minimum value of A. + amax = A.max(axis=1) # Find the maximum value of A. + arange = amax - amin # Find the range of A. + + # xa holds the x 'axis' vector, defining a grid of x values where + # the k.d. function will be evaluated. 
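+        # Descriptive note on the code below (hste): the grid is padded by
+        # arange/8 on each side so boundary kernels are not truncated.  For
+        # each dimension the two-stage solve-the-equation procedure then
+        #   1) takes normal-reference values for psi6 and psi8,
+        #   2) turns them into pilot bandwidths g1 and g2 and re-estimates
+        #      psi4 and psi6 by FFT-convolving the binned data (gridcount)
+        #      with the 4th/6th Gaussian kernel derivatives,
+        #   3) iterates the step h <- (ste_constant2 / psi4(gamma(h)))**(1/5)
+        #      until successive values agree within max(releps*h, abseps) or
+        #      maxit is reached, rescaling when the kernel is not Gaussian.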
+ + ax1 = amin - arange / 8.0 + bx1 = amax + arange / 8.0 + + kernel2 = Kernel('gauss') + mu2, R, _Rdd = kernel2.stats() + ste_constant2 = R / (mu2 ** (2) * n) + fft = np.fft.fft + ifft = np.fft.ifft + + for dim in range(d): + s = sigmaA[dim] + ax = ax1[dim] + bx = bx1[dim] + + xa = np.linspace(ax, bx, inc) + xn = np.linspace(0, bx - ax, inc) + + c = gridcount(A[dim], xa) + + # Step 1 + psi6NS = -15 / (16 * sqrt(pi) * s ** 7) + psi8NS = 105 / (32 * sqrt(pi) * s ** 9) + + # Step 2 + k40, k60 = kernel2.deriv4_6_8_10(0, numout=2) + g1 = (-2 * k40 / (mu2 * psi6NS * n)) ** (1.0 / 7) + g2 = (-2 * k60 / (mu2 * psi8NS * n)) ** (1.0 / 9) + + # Estimate psi6 given g2. + # kernel weights. + kw4, kw6 = kernel2.deriv4_6_8_10(xn / g2, numout=2) + # Apply fftshift to kw. + kw = np.r_[kw6, 0, kw6[-1:0:-1]] + z = np.real(ifft(fft(c, nfft) * fft(kw))) # convolution. + psi6 = np.sum(c * z[:inc]) / (n * (n - 1) * g2 ** 7) + + # Estimate psi4 given g1. + kw4 = kernel2.deriv4_6_8_10(xn / g1, numout=1) # kernel weights. + kw = np.r_[kw4, 0, kw4[-1:0:-1]] # Apply 'fftshift' to kw. + z = np.real(ifft(fft(c, nfft) * fft(kw))) # convolution. + psi4 = np.sum(c * z[:inc]) / (n * (n - 1) * g1 ** 5) + + h1 = h[dim] + h_old = 0 + count = 0 + + while ((abs(h_old - h1) > max(releps * h1, abseps)) and + (count < maxit)): + count += 1 + h_old = h1 + + # Step 3 + gamma_ = ((2 * k40 * mu2 * psi4 * h1 ** 5) / + (-psi6 * R)) ** (1.0 / 7) + + # Now estimate psi4 given gamma_. + # kernel weights. + kw4 = kernel2.deriv4_6_8_10(xn / gamma_, numout=1) + kw = np.r_[kw4, 0, kw4[-1:0:-1]] # Apply 'fftshift' to kw. + z = np.real(ifft(fft(c, nfft) * fft(kw))) # convolution. + + psi4Gamma = np.sum(c * z[:inc]) / (n * (n - 1) * gamma_ ** 5) + + # Step 4 + h1 = (ste_constant2 / psi4Gamma) ** (1.0 / 5) + + # Kernel other than Gaussian scale bandwidth + h1 = h1 * (ste_constant / ste_constant2) ** (1.0 / 5) + + if count >= maxit: + warnings.warn('The obtained value did not converge.') + + h[dim] = h1 + # end for dim loop + return h + + def hisj(self, data, inc=512, L=7): + ''' + HISJ Improved Sheather-Jones estimate of smoothing parameter. + + Unlike many other implementations, this one is immune to problems + caused by multimodal densities with widely separated modes. The + estimation does not deteriorate for multimodal densities, because + it do not assume a parametric model for the data. + + Parameters + ---------- + data - a vector of data from which the density estimate is constructed + inc - the number of mesh points used in the uniform discretization + + Returns + ------- + bandwidth - the optimal bandwidth + + Reference + --------- + Kernel density estimation via diffusion + Z. I. Botev, J. F. Grotowski, and D. P. Kroese (2010) + Annals of Statistics, Volume 38, Number 5, pages 2916-2957. + ''' + A = np.atleast_2d(data) + d, n = A.shape + + # R = int(mkernel(x)^2), mu2 = int(x^2*mkernel(x)) + mu2, R, _Rdd = self.stats() + ste_constant = R / (n * mu2 ** 2) + + amin = A.min(axis=1) # Find the minimum value of A. + amax = A.max(axis=1) # Find the maximum value of A. + arange = amax - amin # Find the range of A. + + # xa holds the x 'axis' vector, defining a grid of x values where + # the k.d. function will be evaluated. 
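+        # Descriptive note on the code below (hisj, following Botev,
+        # Grotowski and Kroese, 2010): the data in each dimension are binned
+        # on a uniform grid of 'inc' points (gridcount), the discrete cosine
+        # transform of the bin fractions gives the coefficients a, and the
+        # optimal squared (rescaled) bandwidth t_star is the root of the
+        # fixed-point equation t = zeta*gamma^[L](t), located with
+        # scipy.optimize.brentq after bracketing a sign change.  The returned
+        # bandwidth is sqrt(t_star) scaled back to the padded data range and
+        # adjusted when the kernel is not Gaussian.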
+ + ax1 = amin - arange / 8.0 + bx1 = amax + arange / 8.0 + + kernel2 = Kernel('gauss') + mu2, R, _Rdd = kernel2.stats() + ste_constant2 = R / (mu2 ** (2) * n) + + def fixed_point(t, N, I, a2): + ''' this implements the function t-zeta*gamma^[L](t)''' + + prod = np.prod + # L = 7 + logI = np.log(I) + f = 2 * pi ** (2 * L) * \ + (a2 * exp(L * logI - I * pi ** 2 * t)).sum() + for s in range(L - 1, 1, -1): + K0 = prod(np.r_[1:2 * s:2]) / sqrt(2 * pi) + const = (1 + (1. / 2) ** (s + 1. / 2)) / 3 + time = (2 * const * K0 / N / f) ** (2. / (3 + 2 * s)) + f = 2 * pi ** (2 * s) * \ + (a2 * exp(s * logI - I * pi ** 2 * time)).sum() + return t - (2 * N * sqrt(pi) * f) ** (-2. / 5) + + h = np.empty(d) + for dim in range(d): + ax = ax1[dim] + bx = bx1[dim] + xa = np.linspace(ax, bx, inc) + R = bx - ax + + c = gridcount(A[dim], xa) + N = len(set(A[dim])) + a = dct(c / len(A[dim]), norm=None) + + # now compute the optimal bandwidth^2 using the referenced method + I = np.asfarray(np.arange(1, inc)) ** 2 + a2 = (a[1:] / 2) ** 2 + + def fun(t): + return fixed_point(t, N, I, a2) + x = np.linspace(0, 0.1, 150) + ai = x[0] + f0 = fun(ai) + for bi in x[1:]: + f1 = fun(bi) + if f1 * f0 <= 0: + # print('ai = %g, bi = %g' % (ai,bi)) + break + else: + ai = bi + # y = np.asarray([fun(j) for j in x]) + # plt.figure(1) + # plt.plot(x,y) + # plt.show() + + # use fzero to solve the equation t=zeta*gamma^[5](t) + try: + t_star = optimize.brentq(fun, a=ai, b=bi) + except: + t_star = 0.28 * N ** (-2. / 5) + warnings.warn('Failure in obtaining smoothing parameter') + + # smooth the discrete cosine transform of initial data using t_star + # a_t = a*exp(-np.arange(inc)**2*pi**2*t_star/2) + # now apply the inverse discrete cosine transform + # density = idct(a_t)/R; + + # take the rescaling of the data into account + bandwidth = sqrt(t_star) * R + + # Kernel other than Gaussian scale bandwidth + h[dim] = bandwidth * (ste_constant / ste_constant2) ** (1.0 / 5) + # end for dim loop + return h + + def hstt(self, data, h0=None, inc=128, maxit=100, releps=0.01, abseps=0.0): + '''HSTT Scott-Tapia-Thompson estimate of smoothing parameter. + + CALL: hs = hstt(data,kernel) + + hs = one dimensional value for smoothing parameter + given the data and kernel. size 1 x D + data = data matrix, size N x D (D = # dimensions ) + kernel = 'epanechnikov' - Epanechnikov kernel. (default) + 'biweight' - Bi-weight kernel. + 'triweight' - Tri-weight kernel. + 'triangular' - Triangular kernel. + 'gaussian' - Gaussian kernel + 'rectangular' - Rectangular kernel. + 'laplace' - Laplace kernel. + 'logistic' - Logistic kernel. + + HSTT returns Scott-Tapia-Thompson (STT) estimate of smoothing + parameter. This is a Solve-The-Equation rule (STE). + Simulation studies shows that the STT estimate of HS + is a good choice under a variety of models. A comparison with + likelihood cross-validation (LCV) indicates that LCV performs slightly + better for short tailed densities. + However, STT method in contrast to LCV is insensitive to outliers. + + Example + ------- + x = rndnorm(0,1,50,1); + hs = hstt(x,'gauss'); + + See also + -------- + hste, hbcv, hboot, hos, hldpi, hlscv, hscv, kde, kdebin + + Reference + --------- + B. W. Silverman (1986) + 'Density estimation for statistics and data analysis' + Chapman and Hall, pp 57--61 + ''' + A = np.atleast_2d(data) + d, n = A.shape + + # R= int(mkernel(x)^2), mu2= int(x^2*mkernel(x)) + mu2, R, _Rdd = self.stats() + + amise_constant = (8 * sqrt(pi) * R / (3 * mu2 ** 2 * n)) ** (1. 
/ 5) + ste_constant = R / (mu2 ** (2) * n) + + sigmaA = self.hns(A) / amise_constant + if h0 is None: + h0 = sigmaA * amise_constant + + h = np.asarray(h0, dtype=float) + + nfft = inc * 2 + amin = A.min(axis=1) # Find the minimum value of A. + amax = A.max(axis=1) # Find the maximum value of A. + arange = amax - amin # Find the range of A. + + # xa holds the x 'axis' vector, defining a grid of x values where + # the k.d. function will be evaluated. + + ax1 = amin - arange / 8.0 + bx1 = amax + arange / 8.0 + + fft = np.fft.fft + ifft = np.fft.ifft + for dim in range(d): + s = sigmaA[dim] + datan = A[dim] / s + ax = ax1[dim] / s + bx = bx1[dim] / s + + xa = np.linspace(ax, bx, inc) + xn = np.linspace(0, bx - ax, inc) + + c = gridcount(datan, xa) + + count = 1 + h_old = 0 + h1 = h[dim] / s + delta = (bx - ax) / (inc - 1) + while ((abs(h_old - h1) > max(releps * h1, abseps)) and + (count < maxit)): + count += 1 + h_old = h1 + + kw4 = self.kernel(xn / h1) / (n * h1 * self.norm_factor(d=1)) + kw = np.r_[kw4, 0, kw4[-1:0:-1]] # Apply 'fftshift' to kw. + f = np.real(ifft(fft(c, nfft) * fft(kw))) # convolution. + + # Estimate psi4=R(f'') using simple finite differences and + # quadrature. + ix = np.arange(1, inc - 1) + z = ((f[ix + 1] - 2 * f[ix] + f[ix - 1]) / delta ** 2) ** 2 + psi4 = delta * z.sum() + h1 = (ste_constant / psi4) ** (1. / 5) + + if count >= maxit: + warnings.warn('The obtained value did not converge.') + + h[dim] = h1 * s + # end % for dim loop + return h + + def hscv(self, data, hvec=None, inc=128, maxit=100, fulloutput=False): + ''' + HSCV Smoothed cross-validation estimate of smoothing parameter. + + CALL: [hs,hvec,score] = hscv(data,kernel,hvec) + + hs = smoothing parameter + hvec = vector defining possible values of hs + (default linspace(0.25*h0,h0,100), h0=0.62) + score = score vector + data = data vector + kernel = 'gaussian' - Gaussian kernel the only supported + + Note that only the first 4 letters of the kernel name is needed. + + Example: + data = rndnorm(0,1,20,1) + [hs hvec score] = hscv(data,'epan'); + plot(hvec,score) + See also hste, hbcv, hboot, hos, hldpi, hlscv, hstt, kde, kdefun + + Wand,M.P. and Jones, M.C. (1986) + 'Kernel smoothing' + Chapman and Hall, pp 75--79 + ''' + # TODO: Add support for other kernels than Gaussian + A = np.atleast_2d(data) + d, n = A.shape + + # R= int(mkernel(x)^2), mu2= int(x^2*mkernel(x)) + mu2, R, _Rdd = self.stats() + + amise_constant = (8 * sqrt(pi) * R / (3 * mu2 ** 2 * n)) ** (1. / 5) + ste_constant = R / (mu2 ** (2) * n) + + sigmaA = self.hns(A) / amise_constant + if hvec is None: + H = amise_constant / 0.93 + hvec = np.linspace(0.25 * H, H, maxit) + hvec = np.asarray(hvec, dtype=float) + + steps = len(hvec) + score = np.zeros(steps) + + nfft = inc * 2 + amin = A.min(axis=1) # Find the minimum value of A. + amax = A.max(axis=1) # Find the maximum value of A. + arange = amax - amin # Find the range of A. + + # xa holds the x 'axis' vector, defining a grid of x values where + # the k.d. function will be evaluated. + + ax1 = amin - arange / 8.0 + bx1 = amax + arange / 8.0 + + kernel2 = Kernel('gauss') + mu2, R, _Rdd = kernel2.stats() + ste_constant2 = R / (mu2 ** (2) * n) + fft = np.fft.fft + ifft = np.fft.ifft + + h = np.zeros(d) + hvec = hvec * (ste_constant2 / ste_constant) ** (1. / 5.) + + k40, k60, k80, k100 = kernel2.deriv4_6_8_10(0, numout=4) + psi8 = 105 / (32 * sqrt(pi)) + psi12 = 3465. / (512 * sqrt(pi)) + g1 = (-2. * k60 / (mu2 * psi8 * n)) ** (1. / 9.) + g2 = (-2. * k100 / (mu2 * psi12 * n)) ** (1. / 13.) 
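+        # Descriptive note on the code below (hscv): for each dimension the
+        # loop estimates the density functionals psi4, psi6, psi8 and psi10
+        # from the binned data using the pilot bandwidths g1..g4, and then
+        # evaluates the smoothed cross-validation score for every candidate
+        # bandwidth in hvec: a diagonal term 1/(2*n*h*sqrt(pi)) plus a double
+        # sum of Gaussian kernel terms over all pairwise differences of the
+        # scaled data at the three scales sig1, sig2 and sig3.  The candidate
+        # minimising the score is returned, rescaled for non-Gaussian kernels.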
+ + for dim in range(d): + s = sigmaA[dim] + ax = ax1[dim] / s + bx = bx1[dim] / s + datan = A[dim] / s + + xa = np.linspace(ax, bx, inc) + xn = np.linspace(0, bx - ax, inc) + + c = gridcount(datan, xa) + + kw4, kw6 = kernel2.deriv4_6_8_10(xn / g1, numout=2) + kw = np.r_[kw6, 0, kw6[-1:0:-1]] + z = np.real(ifft(fft(c, nfft) * fft(kw))) + psi6 = np.sum(c * z[:inc]) / (n ** 2 * g1 ** 7) + + kw4, kw6, kw8, kw10 = kernel2.deriv4_6_8_10(xn / g2, numout=4) + kw = np.r_[kw10, 0, kw10[-1:0:-1]] + z = np.real(ifft(fft(c, nfft) * fft(kw))) + psi10 = np.sum(c * z[:inc]) / (n ** 2 * g2 ** 11) + + g3 = (-2. * k40 / (mu2 * psi6 * n)) ** (1. / 7.) + g4 = (-2. * k80 / (mu2 * psi10 * n)) ** (1. / 11.) + + kw4 = kernel2.deriv4_6_8_10(xn / g3, numout=1) + kw = np.r_[kw4, 0, kw4[-1:0:-1]] + z = np.real(ifft(fft(c, nfft) * fft(kw))) + psi4 = np.sum(c * z[:inc]) / (n ** 2 * g3 ** 5) + + kw4, kw6, kw8 = kernel2.deriv4_6_8_10(xn / g3, numout=3) + kw = np.r_[kw8, 0, kw8[-1:0:-1]] + z = np.real(ifft(fft(c, nfft) * fft(kw))) + psi8 = np.sum(c * z[:inc]) / (n ** 2 * g4 ** 9) + + const = (441. / (64 * pi)) ** (1. / 18.) * \ + (4 * pi) ** (-1. / 5.) * \ + psi4 ** (-2. / 5.) * psi8 ** (-1. / 9.) + + M = np.atleast_2d(datan) + + Y = (M - M.T).ravel() + + for i in range(steps): + g = const * n ** (-23. / 45) * hvec[i] ** (-2) + sig1 = sqrt(2 * hvec[i] ** 2 + 2 * g ** 2) + sig2 = sqrt(hvec[i] ** 2 + 2 * g ** 2) + sig3 = sqrt(2 * g ** 2) + term2 = np.sum(kernel2(Y / sig1) / sig1 - 2 * kernel2( + Y / sig2) / sig2 + kernel2(Y / sig3) / sig3) + + score[i] = 1. / (n * hvec[i] * 2. * sqrt(pi)) + term2 / n ** 2 + + idx = score.argmin() + # Kernel other than Gaussian scale bandwidth + h[dim] = hvec[idx] * (ste_constant / ste_constant2) ** (1 / 5) + if idx == 0: + warnings.warn("Optimum is probably lower than " + "hs={0:g} for dim={1:d}".format(h[dim] * s, dim)) + elif idx == maxit - 1: + msg = "Optimum is probably higher than hs={0:g] for dim={1:d}" + warnings.warn(msg.format(h[dim] * s, dim)) + + hvec = hvec * (ste_constant / ste_constant2) ** (1 / 5) + if fulloutput: + return h * sigmaA, score, hvec, sigmaA + else: + return h * sigmaA + + def hldpi(self, data, L=2, inc=128): + '''HLDPI L-stage Direct Plug-In estimate of smoothing parameter. + + CALL: hs = hldpi(data,kernel,L) + + hs = one dimensional value for smoothing parameter + given the data and kernel. size 1 x D + data = data matrix, size N x D (D = # dimensions ) + kernel = 'epanechnikov' - Epanechnikov kernel. + 'biweight' - Bi-weight kernel. + 'triweight' - Tri-weight kernel. + 'triangluar' - Triangular kernel. + 'gaussian' - Gaussian kernel + 'rectangular' - Rectanguler kernel. + 'laplace' - Laplace kernel. + 'logistic' - Logistic kernel. + L = 0,1,2,3,... (default 2) + + Note that only the first 4 letters of the kernel name is needed. + + Example: + x = rndnorm(0,1,50,1); + hs = hldpi(x,'gauss',1); + + See also hste, hbcv, hboot, hos, hlscv, hscv, hstt, kde, kdefun + + Wand,M.P. and Jones, M.C. (1995) + 'Kernel smoothing' + Chapman and Hall, pp 67--74 + ''' + A = np.atleast_2d(data) + d, n = A.shape + + # R= int(mkernel(x)^2), mu2= int(x^2*mkernel(x)) + mu2, R, _Rdd = self.stats() + + amise_constant = (8 * sqrt(pi) * R / (3 * n * mu2 ** 2)) ** (1. / 5) + ste_constant = R / (n * mu2 ** 2) + + sigmaA = self.hns(A) / amise_constant + + nfft = inc * 2 + amin = A.min(axis=1) # Find the minimum value of A. + amax = A.max(axis=1) # Find the maximum value of A. + arange = amax - amin # Find the range of A. 
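+        # Descriptive note on the code below (hldpi, Wand and Jones, 1995,
+        # pp 67--74): the loop starts from the normal-reference value of the
+        # functional psi_r with r = 2*L + 4 (Eq. 3.7) and steps down from
+        # stage L to 1.  At each stage a pilot bandwidth gi is computed from
+        # the current psi estimate and the next lower-order functional is
+        # re-estimated by FFT-convolving the binned data with the matching
+        # Gaussian kernel derivative.  The final psi4 estimate gives
+        # h = (ste_constant / psi4)**(1/5).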
+ + # xa holds the x 'axis' vector, defining a grid of x values where + # the k.d. function will be evaluated. + + ax1 = amin - arange / 8.0 + bx1 = amax + arange / 8.0 + + kernel2 = Kernel('gauss') + mu2, _R, _Rdd = kernel2.stats() + + fft = np.fft.fft + ifft = np.fft.ifft + + h = np.zeros(d) + for dim in range(d): + s = sigmaA[dim] + datan = A[dim] # / s + ax = ax1[dim] # / s + bx = bx1[dim] # / s + + xa = np.linspace(ax, bx, inc) + xn = np.linspace(0, bx - ax, inc) + + c = gridcount(datan, xa) + + r = 2 * L + 4 + rd2 = L + 2 + + # Eq. 3.7 in Wand and Jones (1995) + psi_r = (-1) ** (rd2) * np.prod( + np.r_[rd2 + 1:r + 1]) / (sqrt(pi) * (2 * s) ** (r + 1)) + psi = psi_r + if L > 0: + # High order derivatives of the Gaussian kernel + Kd = kernel2.deriv4_6_8_10(0, numout=L) + + # L-stage iterations to estimate PSI_4 + for ix in range(L, 0, -1): + gi = (-2 * Kd[ix - 1] / + (mu2 * psi * n)) ** (1. / (2 * ix + 5)) + + # Obtain the kernel weights. + kw0 = kernel2.deriv4_6_8_10(xn / gi, numout=ix) + if ix > 1: + kw0 = kw0[-1] + # Apply 'fftshift' to kw. + kw = np.r_[kw0, 0, kw0[inc - 1:0:-1]] + + # Perform the convolution. + z = np.real(ifft(fft(c, nfft) * fft(kw))) + + psi = np.sum(c * z[:inc]) / (n ** 2 * gi ** (2 * ix + 3)) + # end + # end + h[dim] = (ste_constant / psi) ** (1. / 5) + return h + + def norm_factor(self, d=1, n=None): + return self.kernel.norm_factor(d, n) + + def eval_points(self, points): + return self.kernel(np.atleast_2d(points)) + __call__ = eval_points + + +def mkernel(X, kernel): + """MKERNEL Multivariate Kernel Function. + + Paramaters + ---------- + X : array-like + matrix size d x n (d = # dimensions, n = # evaluation points) + kernel : string + defining kernel + 'epanechnikov' - Epanechnikov kernel. + 'biweight' - Bi-weight kernel. + 'triweight' - Tri-weight kernel. + 'p1epanechnikov' - product of 1D Epanechnikov kernel. + 'p1biweight' - product of 1D Bi-weight kernel. + 'p1triweight' - product of 1D Tri-weight kernel. + 'triangular' - Triangular kernel. + 'gaussian' - Gaussian kernel + 'rectangular' - Rectangular kernel. + 'laplace' - Laplace kernel. + 'logistic' - Logistic kernel. + Note that only the first 4 letters of the kernel name is needed. + + Returns + ------- + z : ndarray + kernel function values evaluated at X + + See also + -------- + kde, kdefun, kdebin + + References + ---------- + B. W. Silverman (1986) + 'Density estimation for statistics and data analysis' + Chapman and Hall, pp. 43, 76 + + Wand, M. P. and Jones, M. C. (1995) + 'Density estimation for statistics and data analysis' + Chapman and Hall, pp 31, 103, 175 + + """ + fun = _MKERNEL_DICT[kernel[:4]] + return fun(np.atleast_2d(X)) + + +if __name__ == '__main__': + test_docstrings(__file__) diff --git a/wafo/testing.py b/wafo/testing.py new file mode 100644 index 0000000..6d5329b --- /dev/null +++ b/wafo/testing.py @@ -0,0 +1,18 @@ +''' +Created on 15. des. 2016 + +@author: pab +''' +import inspect + + +def test_docstrings(name=''): + import doctest + if not name: + name = inspect.stack()[1][1] + print('Testing docstrings in {}'.format(name)) + doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE | + doctest.ELLIPSIS) + +if __name__ == '__main__': + pass diff --git a/wafo/tests/test_kdetools.py b/wafo/tests/test_kdetools.py index 9db6d6a..bcbc0a7 100644 --- a/wafo/tests/test_kdetools.py +++ b/wafo/tests/test_kdetools.py @@ -3,398 +3,473 @@ Created on 20. nov. 
2010 @author: pab ''' - -import numpy as np # @UnusedImport -from numpy import array # @UnusedImport -import wafo.kdetools as wk # @UnusedImport -# import pylab as plb - - -def test0_KDE1D(): - ''' - >>> data = array([0.75355792, 0.72779194, 0.94149169, 0.07841119, - ... 2.32291887, 1.10419995, 0.77055114, 0.60288273, - ... 1.36883635, 1.74754326, 1.09547561, 1.01671133, - ... 0.73211143, 0.61891719, 0.75903487, 1.8919469, - ... 0.72433808, 1.92973094, 0.44749838, 1.36508452]) - - >>> x = np.linspace(0, max(data.ravel()) + 1, 10) - >>> import wafo.kdetools as wk - >>> kde = wk.KDE(data, hs=0.5, alpha=0.5) - - >>> kde0 = wk.KDE(data, hs=0.5, alpha=0.0, inc=16) - - >>> kde0.eval_grid(x) - array([ 0.2039735 , 0.40252503, 0.54595078, 0.52219649, 0.3906213 , - 0.26381501, 0.16407362, 0.08270612, 0.02991145, 0.00720821]) - >>> kde0.eval_grid_fast(x) - array([ 0.20729484, 0.39865044, 0.53716945, 0.5169322 , 0.39060223, - 0.26441126, 0.16388801, 0.08388527, 0.03227164, 0.00883579]) - - >>> f = kde0.eval_grid_fast(); f - array([ 0.06807544, 0.12949095, 0.21985421, 0.33178031, 0.44334874, - 0.52429234, 0.55140336, 0.52221323, 0.45500674, 0.3752208 , - 0.30046799, 0.235667 , 0.17854402, 0.12721305, 0.08301993, - 0.04862324]) - >>> np.allclose(np.trapz(f,kde0.args), array([ 0.96716261])) - True - ''' - - -def test1_TKDE1D(): - ''' - N = 20 - data = np.random.rayleigh(1, size=(N,)) - >>> data = array([0.75355792, 0.72779194, 0.94149169, 0.07841119, - ... 2.32291887, 1.10419995, 0.77055114, 0.60288273, - ... 1.36883635, 1.74754326, 1.09547561, 1.01671133, - ... 0.73211143, 0.61891719, 0.75903487, 1.8919469, - ... 0.72433808, 1.92973094, 0.44749838, 1.36508452]) - - >>> x = np.linspace(0.01, max(data.ravel()) + 1, 10) - >>> kde = wk.TKDE(data, hs=0.5, L2=0.5) - >>> f = kde(x) - >>> f - array([ 1.03982714, 0.45839018, 0.39514782, 0.32860602, 0.26433318, - 0.20717946, 0.15907684, 0.1201074 , 0.08941027, 0.06574882]) - - >>> np.allclose(np.trapz(f, x), 0.94787730659349068) - True - - h1 = plb.plot(x, f) # 1D probability density plot - ''' - - -def test1_KDE1D(): - ''' - N = 20 - data = np.random.rayleigh(1, size=(N,)) - >>> data = array([0.75355792, 0.72779194, 0.94149169, 0.07841119, - ... 2.32291887, 1.10419995, 0.77055114, 0.60288273, - ... 1.36883635, 1.74754326, 1.09547561, 1.01671133, - ... 0.73211143, 0.61891719, 0.75903487, 1.8919469, - ... 0.72433808, 1.92973094, 0.44749838, 1.36508452]) - - >>> x = np.linspace(0, max(data.ravel()) + 1, 10) - >>> kde = wk.KDE(data, hs=0.5) - >>> f = kde(x) - >>> np.allclose(f, [ 0.2039735 , 0.40252503, 0.54595078, 0.52219649, - ... 0.3906213, 0.26381501, 0.16407362, 0.08270612, 0.02991145, - ... 0.00720821]) - True - >>> np.allclose(np.trapz(f, x), 0.92576174424281876) - True - - h1 = plb.plot(x, f) # 1D probability density plot - ''' - - -def test2_KDE1D(): - ''' - N = 20 - data = np.random.rayleigh(1, size=(N,)) - >>> data = array([ 0.75355792, 0.72779194, 0.94149169, 0.07841119, - ... 2.32291887, 1.10419995, 0.77055114, 0.60288273, 1.36883635, - ... 1.74754326, 1.09547561, 1.01671133, 0.73211143, 0.61891719, - ... 0.75903487, 1.8919469, 0.72433808, 1.92973094, 0.44749838, - ... 1.36508452]) - - >>> data = np.asarray([1,2]) - >>> x = np.linspace(0, max(data.ravel()) + 1, 10) - >>> kde = wk.KDE(data, hs=0.5) - >>> f = kde(x) - >>> np.allclose(f, - ... [ 0.0541248 , 0.16555235, 0.33084399, 0.45293325, 0.48345808, - ... 
0.48345808, 0.45293325, 0.33084399, 0.16555235, 0.0541248 ]) - True - >>> np.allclose(np.trapz(f, x), 0.97323338046725172) - True - - h1 = plb.plot(x, f) # 1D probability density plot - ''' - - -def test1a_KDE1D(): - ''' - N = 20 - data = np.random.rayleigh(1, size=(N,)) - >>> data = array([ - ... 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887, - ... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, - ... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, - ... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452]) - - >>> x = np.linspace(0, max(data.ravel()) + 1, 10) - >>> kde = wk.KDE(data, hs=0.5, alpha=0.5) - >>> f = kde(x) - >>> np.allclose(f, - ... [ 0.17252055, 0.41014271, 0.61349072, 0.57023834, 0.37198073, - ... 0.21409279, 0.12738463, 0.07460326, 0.03956191, 0.01887164]) - True - >>> np.allclose(np.trapz(f, x), 0.92938023659047952) - True - - h1 = plb.plot(x, f) # 1D probability density plot - ''' - - -def test2a_KDE1D(): - ''' - N = 20 - data = np.random.rayleigh(1, size=(N,)) - >>> data = array([ - ... 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887, - ... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, - ... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, - ... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452]) - - >>> data = np.asarray([1,2]) - >>> x = np.linspace(0, max(data.ravel()) + 1, 10) - >>> kde = wk.KDE(data, hs=0.5, alpha=0.5) - >>> f = kde(x) - >>> np.allclose(f, - ... [ 0.0541248 , 0.16555235, 0.33084399, 0.45293325, 0.48345808, - ... 0.48345808, 0.45293325, 0.33084399, 0.16555235, 0.0541248 ]) - True - >>> np.allclose(np.trapz(f, x), 0.97323338046725172) - True - - h1 = plb.plot(x, f) # 1D probability density plot - ''' - - -def test_KDE2D(): - ''' - N = 20 - data = np.random.rayleigh(1, size=(2, N)) - >>> data = array([[ - ... 0.38103275, 0.35083136, 0.90024207, 1.88230239, 0.96815399, - ... 0.57392873, 1.63367908, 1.20944125, 2.03887811, 0.81789145, - ... 0.69302049, 1.40856592, 0.92156032, 2.14791432, 2.04373821, - ... 0.69800708, 0.58428735, 1.59128776, 2.05771405, 0.87021964], - ... [1.44080694, 0.39973751, 1.331243 , 2.48895822, 1.18894158, - ... 1.40526085, 1.01967897, 0.81196474, 1.37978932, 2.03334689, - ... 0.870329 , 1.25106862, 0.5346619 , 0.47541236, 1.51930093, - ... 0.58861519, 1.19780448, 0.81548296, 1.56859488, 1.60653533]]) - - >>> x = np.linspace(0, max(data.ravel()) + 1, 3) - - >>> kde = wk.KDE(data, hs=0.5, alpha=0.5) - - >>> kde0 = wk.KDE(data, hs=0.5, alpha=0.0, inc=16) - - >>> np.allclose(kde0.eval_grid(x, x), - ... [[ 3.27260963e-02, 4.21654678e-02, 5.85338634e-04], - ... [ 6.78845466e-02, 1.42195839e-01, 1.41676003e-03], - ... [ 1.39466746e-04, 4.26983850e-03, 2.52736185e-05]]) - True - >>> np.allclose(kde0.eval_grid_fast(x, x), - ... [[ 0.04435061, 0.06433531, 0.00413538], - ... [ 0.07218297, 0.12358196, 0.00928889], - ... [ 0.00161333, 0.00794858, 0.00058748]]) - True - ''' - - -def test_smooth_params(): - ''' - >>> data = np.array([[ - ... 0.932896 , 0.89522635, 0.80636346, 1.32283371, 0.27125435, - ... 1.91666304, 2.30736635, 1.13662384, 1.73071287, 1.06061127, - ... 0.99598512, 2.16396591, 1.23458213, 1.12406686, 1.16930431, - ... 0.73700592, 1.21135139, 0.46671506, 1.3530304 , 0.91419104], - ... [ 0.62759088, 0.23988169, 2.04909823, 0.93766571, 1.19343762, - ... 1.94954931, 0.84687514, 0.49284897, 1.05066204, 1.89088505, - ... 0.840738 , 1.02901457, 1.0758625 , 1.76357967, 0.45792897, - ... 
1.54488066, 0.17644313, 1.6798871 , 0.72583514, 2.22087245], - ... [ 1.69496432, 0.81791905, 0.82534709, 0.71642389, 0.89294732, - ... 1.66888649, 0.69036947, 0.99961448, 0.30657267, 0.98798713, - ... 0.83298728, 1.83334948, 1.90144186, 1.25781913, 0.07122458, - ... 2.42340852, 2.41342037, 0.87233305, 1.17537114, 1.69505988]]) - - >>> gauss = wk.Kernel('gaussian') - >>> gauss.hns(data) - array([ 0.18154437, 0.36207987, 0.37396219]) - >>> gauss.hos(data) - array([ 0.195209 , 0.3893332 , 0.40210988]) - >>> gauss.hmns(data) - array([[ 3.25196193e-01, -2.68892467e-02, 3.18932448e-04], - [ -2.68892467e-02, 3.91283306e-01, 2.38654678e-02], - [ 3.18932448e-04, 2.38654678e-02, 4.05123874e-01]]) - >>> gauss.hscv(data) - array([ 0.16858959, 0.32739383, 0.3046287 ]) - - >>> gauss.hstt(data) - array([ 0.18099075, 0.50409881, 0.11018912]) - - >>> gauss.hste(data) - array([ 0.16750009, 0.29059113, 0.17994255]) - - >>> gauss.hldpi(data) - array([ 0.1732289 , 0.33159097, 0.3107633 ]) - - >>> np.allclose(gauss.hisj(data), - ... array([ 0.29542502, 0.74277133, 0.51899114])) - True - ''' - - -def test_gridcount_1D(): - ''' - N = 20 - data = np.random.rayleigh(1, size=(N,)) - >>> data = array([ - ... 0.75355792, 0.72779194, 0.94149169, 0.07841119, 2.32291887, - ... 1.10419995, 0.77055114, 0.60288273, 1.36883635, 1.74754326, - ... 1.09547561, 1.01671133, 0.73211143, 0.61891719, 0.75903487, - ... 1.8919469 , 0.72433808, 1.92973094, 0.44749838, 1.36508452]) - - >>> x = np.linspace(0, max(data.ravel()) + 1, 10) - >>> dx = x[1] - x[0] - >>> c = wk.gridcount(data, x) - >>> np.allclose(c, - ... [ 0.78762626, 1.77520717, 7.99190087, 4.04054449, 1.67156643, - ... 2.38228499, 1.05933195, 0.29153785, 0. , 0. ]) - True - - h = plb.plot(x, c, '.') # 1D histogram - - h1 = plb.plot(x, c / dx / N) # 1D probability density plot - t = np.trapz(c / dx / N, x) - print(t) - ''' - - -def test_gridcount_2D(): - ''' - N = 20 - data = np.random.rayleigh(1, size=(2, N)) - >>> data = array([[ - ... 0.38103275, 0.35083136, 0.90024207, 1.88230239, 0.96815399, - ... 0.57392873, 1.63367908, 1.20944125, 2.03887811, 0.81789145, - ... 0.69302049, 1.40856592, 0.92156032, 2.14791432, 2.04373821, - ... 0.69800708, 0.58428735, 1.59128776, 2.05771405, 0.87021964], - ... [ 1.44080694, 0.39973751, 1.331243 , 2.48895822, 1.18894158, - ... 1.40526085, 1.01967897, 0.81196474, 1.37978932, 2.03334689, - ... 0.870329 , 1.25106862, 0.5346619 , 0.47541236, 1.51930093, - ... 0.58861519, 1.19780448, 0.81548296, 1.56859488, 1.60653533]]) - - >>> x = np.linspace(0, max(data.ravel()) + 1, 5) - >>> dx = x[1] - x[0] - >>> X = np.vstack((x, x)) - >>> c = wk.gridcount(data, X) - >>> np.allclose(c, - ... [[ 0.38922806, 0.8987982 , 0.34676493, 0.21042807, 0. ], - ... [ 1.15012203, 5.16513541, 3.19250588, 0.55420752, 0. ], - ... [ 0.74293418, 3.42517219, 1.97923195, 0.76076621, 0. ], - ... [ 0.02063536, 0.31054405, 0.71865964, 0.13486633, 0. ], - ... [ 0. , 0. , 0. , 0. , 0. ]]) - True - - h = plb.plot(x, c, '.') # 1D histogram - - h1 = plb.plot(x, c / dx / N) # 1D probability density plot - t = np.trapz(c / dx / N, x) - print(t) - ''' - - -def test_gridcount_3D(): - ''' - N = 20 - data = np.random.rayleigh(1, size=(3, N)) - >>> data = np.array([[ - ... 0.932896 , 0.89522635, 0.80636346, 1.32283371, 0.27125435, - ... 1.91666304, 2.30736635, 1.13662384, 1.73071287, 1.06061127, - ... 0.99598512, 2.16396591, 1.23458213, 1.12406686, 1.16930431, - ... 0.73700592, 1.21135139, 0.46671506, 1.3530304 , 0.91419104], - ... 
[ 0.62759088, 0.23988169, 2.04909823, 0.93766571, 1.19343762, - ... 1.94954931, 0.84687514, 0.49284897, 1.05066204, 1.89088505, - ... 0.840738 , 1.02901457, 1.0758625 , 1.76357967, 0.45792897, - ... 1.54488066, 0.17644313, 1.6798871 , 0.72583514, 2.22087245], - ... [ 1.69496432, 0.81791905, 0.82534709, 0.71642389, 0.89294732, - ... 1.66888649, 0.69036947, 0.99961448, 0.30657267, 0.98798713, - ... 0.83298728, 1.83334948, 1.90144186, 1.25781913, 0.07122458, - ... 2.42340852, 2.41342037, 0.87233305, 1.17537114, 1.69505988]]) - - >>> x = np.linspace(0, max(data.ravel()) + 1, 3) - >>> dx = x[1] - x[0] - >>> X = np.vstack((x, x, x)) - >>> c = wk.gridcount(data, X) - >>> np.allclose(c, - ... [[[ 8.74229894e-01, 1.27910940e+00, 1.42033973e-01], - ... [ 1.94778915e+00, 2.59536282e+00, 3.28213680e-01], - ... [ 1.08429416e-01, 1.69571495e-01, 7.48896775e-03]], - ... [[ 1.44969128e+00, 2.58396370e+00, 2.45459949e-01], - ... [ 2.28951650e+00, 4.49653348e+00, 2.73167915e-01], - ... [ 1.10905565e-01, 3.18733817e-01, 1.12880816e-02]], - ... [[ 7.49265424e-02, 2.18142488e-01, 0.00000000e+00], - ... [ 8.53886762e-02, 3.73415131e-01, 0.00000000e+00], - ... [ 4.16196568e-04, 1.62218824e-02, 0.00000000e+00]]]) - True - ''' - - -def test_gridcount_4D(): - ''' - N = 20 - data = np.random.rayleigh(1, size=(2, N)) - >>> data = array([[ - ... 0.38103275, 0.35083136, 0.90024207, 1.88230239, 0.96815399, - ... 0.57392873, 1.63367908, 1.20944125, 2.03887811, 0.81789145], - ... [ 0.69302049, 1.40856592, 0.92156032, 2.14791432, 2.04373821, - ... 0.69800708, 0.58428735, 1.59128776, 2.05771405, 0.87021964], - ... [ 1.44080694, 0.39973751, 1.331243 , 2.48895822, 1.18894158, - ... 1.40526085, 1.01967897, 0.81196474, 1.37978932, 2.03334689], - ... [ 0.870329 , 1.25106862, 0.5346619 , 0.47541236, 1.51930093, - ... 0.58861519, 1.19780448, 0.81548296, 1.56859488, 1.60653533]]) - - >>> x = np.linspace(0, max(data.ravel()) + 1, 3) - >>> dx = x[1] - x[0] - >>> X = np.vstack((x, x, x, x)) - >>> c = wk.gridcount(data, X) - >>> np.allclose(c, - ... [[[[ 1.77163904e-01, 1.87720108e-01, 0.00000000e+00], - ... [ 5.72573585e-01, 6.09557834e-01, 0.00000000e+00], - ... [ 3.48549923e-03, 4.05931870e-02, 0.00000000e+00]], - ... [[ 1.83770124e-01, 2.56357594e-01, 0.00000000e+00], - ... [ 4.35845892e-01, 6.14958970e-01, 0.00000000e+00], - ... [ 3.07662204e-03, 3.58312786e-02, 0.00000000e+00]], - ... [[ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], - ... [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], - ... [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]]], - ... [[[ 3.41883175e-01, 5.97977973e-01, 0.00000000e+00], - ... [ 5.72071865e-01, 8.58566538e-01, 0.00000000e+00], - ... [ 3.46939323e-03, 4.04056116e-02, 0.00000000e+00]], - ... [[ 3.58861043e-01, 6.28962785e-01, 0.00000000e+00], - ... [ 8.80697705e-01, 1.47373158e+00, 0.00000000e+00], - ... [ 2.22868504e-01, 1.18008528e-01, 0.00000000e+00]], - ... [[ 2.91835067e-03, 2.60268355e-02, 0.00000000e+00], - ... [ 3.63686503e-02, 1.07959459e-01, 0.00000000e+00], - ... [ 1.88555613e-02, 7.06358976e-03, 0.00000000e+00]]], - ... [[[ 3.13810608e-03, 2.11731327e-02, 0.00000000e+00], - ... [ 6.71606255e-03, 4.53139824e-02, 0.00000000e+00], - ... [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]], - ... [[ 7.05946179e-03, 5.44614852e-02, 0.00000000e+00], - ... [ 1.09099593e-01, 1.95935584e-01, 0.00000000e+00], - ... [ 6.61257395e-02, 2.47717418e-02, 0.00000000e+00]], - ... [[ 6.38695629e-04, 5.69610302e-03, 0.00000000e+00], - ... [ 1.00358265e-02, 2.44053065e-02, 0.00000000e+00], - ... 
[ 5.67244468e-03, 2.12498697e-03, 0.00000000e+00]]]]) - True - - h = plb.plot(x, c, '.') # 1D histogram - - h1 = plb.plot(x, c / dx / N) # 1D probability density plot - t = np.trapz(x, c / dx / N) - print(t) - ''' - - -def test_docstrings(): - import doctest - print('Testing docstrings in %s' % __file__) - doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) - -if __name__ == '__main__': - test_docstrings() +from __future__ import division +import unittest +import numpy as np +from numpy.testing import assert_allclose +from numpy import array, inf +import wafo.kdetools as wk + + +class TestKdeTools(unittest.TestCase): + + def setUp(self): + + # N = 20 + # data = np.random.rayleigh(1, size=(N,)) + self.data = array([0.75355792, 0.72779194, 0.94149169, 0.07841119, + 2.32291887, 1.10419995, 0.77055114, 0.60288273, + 1.36883635, 1.74754326, 1.09547561, 1.01671133, + 0.73211143, 0.61891719, 0.75903487, 1.8919469, + 0.72433808, 1.92973094, 0.44749838, 1.36508452]) + self.x = np.linspace(0, max(self.data) + 1, 10) + + def test0_KDE1D(self): + data, x = self.data, self.x + # kde = wk.KDE(data, hs=0.5, alpha=0.5) + + kde0 = wk.KDE(data, hs=0.5, alpha=0.0, inc=16) + + fx = kde0.eval_grid(x) + assert_allclose(fx, [0.2039735, 0.40252503, 0.54595078, + 0.52219649, 0.3906213, 0.26381501, 0.16407362, + 0.08270612, 0.02991145, 0.00720821]) + + fx = kde0.eval_grid(x, r=1) + + assert_allclose(-fx, [0.11911419724002906, 0.13440000694772541, + 0.044400116190638696, -0.0677695267531197, + -0.09555596523854318, -0.07498819087690148, + -0.06167607128369182, -0.04678588231996062, + -0.024515979196411814, -0.008022010381009501]) + + fx = kde0.eval_grid(x, r=2) + assert_allclose(fx, [0.08728138131197069, 0.07558648034784508, + 0.05093715852686607, 0.07908624791267539, + 0.10495675573359599, 0.07916167222333347, + 0.048168330179460386, 0.03438361415806721, + 0.02197927811015286, 0.009222988165160621]) + + ffx = kde0.eval_grid_fast(x) + assert_allclose(ffx, [0.20729484, 0.39865044, 0.53716945, 0.5169322, + 0.39060223, 0.26441126, 0.16388801, 0.08388527, + 0.03227164, 0.00883579], 1e-6) + + fx = kde0.eval_grid_fast(x, r=1) + assert_allclose(fx, [-0.11582450668441863, -0.12901768780183628, + -0.04402464127812092, 0.0636190549560749, + 0.09345144501310157, 0.07573621607126926, + 0.06149475587201987, 0.04550210608639078, + 0.024427027615689087, 0.00885576504750473]) + + fx = kde0.eval_grid_fast(x, r=2) + assert_allclose(fx, [0.08499284131672676, 0.07572564161758065, + 0.05329987919556978, 0.07849796347259348, + 0.10232741197885842, 0.07869015379158453, + 0.049431823916945394, 0.034527256372343613, + 0.021517998409663567, 0.009527401063843402]) + + f = kde0.eval_grid_fast() + assert_allclose(np.trapz(f, kde0.args), 0.995001) + assert_allclose(f, [0.011494108953097538, 0.0348546729842836, + 0.08799292403553607, 0.18568717590587996, + 0.32473136104523725, 0.46543163412700084, + 0.5453201564089711, 0.5300582814373698, + 0.44447650672207173, 0.3411961246641896, + 0.25103852230993573, 0.17549519961525845, + 0.11072988772879173, 0.05992730870218242, + 0.02687783924833738, 0.00974982785617795]) + + def skiptest0_KDEgauss_1D(self): + data, x = self.data, self.x + # kde = wk.KDE(data, hs=0.5, alpha=0.5) + + kde0 = wk.KDEgauss(data, hs=0.5, alpha=0.0, inc=16) + + fx = kde0.eval_grid(x) + assert_allclose(fx, [0.2039735, 0.40252503, 0.54595078, + 0.52219649, 0.3906213, 0.26381501, 0.16407362, + 0.08270612, 0.02991145, 0.00720821]) + + fx = kde0.eval_grid(x, r=1) + + assert_allclose(-fx, [0.11911419724002906, 0.13440000694772541, 
+ 0.044400116190638696, -0.0677695267531197, + -0.09555596523854318, -0.07498819087690148, + -0.06167607128369182, -0.04678588231996062, + -0.024515979196411814, -0.008022010381009501]) + + fx = kde0.eval_grid(x, r=2) + assert_allclose(fx, [0.08728138131197069, 0.07558648034784508, + 0.05093715852686607, 0.07908624791267539, + 0.10495675573359599, 0.07916167222333347, + 0.048168330179460386, 0.03438361415806721, + 0.02197927811015286, 0.009222988165160621]) + + ffx = kde0.eval_grid_fast(x) + # print(ffx.tolist()) + assert_allclose(ffx, [0.20729484, 0.39865044, 0.53716945, 0.5169322, + 0.39060223, 0.26441126, 0.16388801, 0.08388527, + 0.03227164, 0.00883579], 1e-6) + + fx = kde0.eval_grid_fast(x, r=1) + assert_allclose(fx, [-0.11582450668441863, -0.12901768780183628, + -0.04402464127812092, 0.0636190549560749, + 0.09345144501310157, 0.07573621607126926, + 0.06149475587201987, 0.04550210608639078, + 0.024427027615689087, 0.00885576504750473]) + + fx = kde0.eval_grid_fast(x, r=2) + assert_allclose(fx, [0.08499284131672676, 0.07572564161758065, + 0.05329987919556978, 0.07849796347259348, + 0.10232741197885842, 0.07869015379158453, + 0.049431823916945394, 0.034527256372343613, + 0.021517998409663567, 0.009527401063843402]) + + f = kde0.eval_grid_fast() + assert_allclose(f, [0.06807544, 0.12949095, 0.21985421, 0.33178031, + 0.44334874, 0.52429234, 0.55140336, 0.52221323, + 0.45500674, 0.3752208, 0.30046799, 0.235667, + 0.17854402, 0.12721305, 0.08301993, 0.04862324]) + assert_allclose(np.trapz(f, kde0.args), 0.96716261) + + def test1_TKDE1D(self): + data = self.data + x = np.linspace(0.01, max(data) + 1, 10) + kde = wk.TKDE(data, hs=0.5, L2=0.5) + f = kde(x) + assert_allclose(f, [1.03982714, 0.45839018, 0.39514782, 0.32860602, + 0.26433318, 0.20717946, 0.15907684, 0.1201074, + 0.08941027, 0.06574882]) + assert_allclose(np.trapz(f, x), 0.94787730659349068) + f = kde.eval_grid_fast(x) + assert_allclose(f, [1.0401892415290148, 0.45838973393693677, + 0.39514689240671547, 0.32860531818532457, + 0.2643330110605783, 0.20717975528556506, + 0.15907696844388747, 0.12010770443337843, + 0.08941129458260941, 0.06574899139165799]) + f = kde.eval_grid_fast2(x) + assert_allclose(f, [1.0401892415290148, 0.45838973393693677, + 0.39514689240671547, 0.32860531818532457, + 0.2643330110605783, 0.20717975528556506, + 0.15907696844388747, 0.12010770443337843, + 0.08941129458260941, 0.06574899139165799]) + assert_allclose(np.trapz(f, x), 0.9479438058416647) + + def test1_KDE1D(self): + data, x = self.data, self.x + kde = wk.KDE(data, hs=0.5) + f = kde(x) + assert_allclose(f, [0.2039735, 0.40252503, 0.54595078, 0.52219649, + 0.3906213, 0.26381501, 0.16407362, 0.08270612, + 0.02991145, 0.00720821]) + + assert_allclose(np.trapz(f, x), 0.92576174424281876) + + def test2_KDE1D(self): + # data, x = self.data, self.x + + data = np.asarray([1, 2]) + x = np.linspace(0, max(np.ravel(data)) + 1, 10) + kde = wk.KDE(data, hs=0.5) + f = kde(x) + assert_allclose(f, [0.0541248, 0.16555235, 0.33084399, 0.45293325, + 0.48345808, 0.48345808, 0.45293325, 0.33084399, + 0.16555235, 0.0541248]) + + assert_allclose(np.trapz(f, x), 0.97323338046725172) + + def test1a_KDE1D(self): + data, x = self.data, self.x + kde = wk.KDE(data, hs=0.5, alpha=0.5) + f = kde(x) + assert_allclose(f, [0.17252055, 0.41014271, 0.61349072, 0.57023834, + 0.37198073, 0.21409279, 0.12738463, 0.07460326, + 0.03956191, 0.01887164]) + + assert_allclose(np.trapz(f, x), 0.92938023659047952) + + def test2a_KDE1D(self): + # data, x = self.data, self.x + data = np.asarray([1, 
2]) + x = np.linspace(0, max(np.ravel(data)) + 1, 10) + kde = wk.KDE(data, hs=0.5, alpha=0.5) + f = kde(x) + assert_allclose(f, [0.0541248, 0.16555235, 0.33084399, 0.45293325, + 0.48345808, 0.48345808, 0.45293325, 0.33084399, + 0.16555235, 0.0541248]) + + assert_allclose(np.trapz(f, x), 0.97323338046725172) + + def test_KDE2D(self): + # N = 20 + # data = np.random.rayleigh(1, size=(2, N)) + data = array([ + [0.38103275, 0.35083136, 0.90024207, 1.88230239, 0.96815399, + 0.57392873, 1.63367908, 1.20944125, 2.03887811, 0.81789145, + 0.69302049, 1.40856592, 0.92156032, 2.14791432, 2.04373821, + 0.69800708, 0.58428735, 1.59128776, 2.05771405, 0.87021964], + [1.44080694, 0.39973751, 1.331243, 2.48895822, 1.18894158, + 1.40526085, 1.01967897, 0.81196474, 1.37978932, 2.03334689, + 0.870329, 1.25106862, 0.5346619, 0.47541236, 1.51930093, + 0.58861519, 1.19780448, 0.81548296, 1.56859488, 1.60653533]]) + + x = np.linspace(0, max(np.ravel(data)) + 1, 3) + + kde0 = wk.KDE(data, hs=0.5, alpha=0.0, inc=512) + + assert_allclose(kde0.eval_grid(x, x), + [[3.27260963e-02, 4.21654678e-02, 5.85338634e-04], + [6.78845466e-02, 1.42195839e-01, 1.41676003e-03], + [1.39466746e-04, 4.26983850e-03, 2.52736185e-05]]) + + t = [[0.0443506097653615, 0.06433530873456418, 0.0041353838654317856], + [0.07218297149063724, 0.1235819591878892, 0.009288890372002473], + [0.001613328022214066, 0.00794857884864038, 0.0005874786787715641] + ] + assert_allclose(kde0.eval_grid_fast(x, x), t) + + def test_gridcount_1D(self): + data, x = self.data, self.x + dx = x[1] - x[0] + c = wk.gridcount(data, x) + assert_allclose(c, [0.78762626, 1.77520717, 7.99190087, 4.04054449, + 1.67156643, 2.38228499, 1.05933195, 0.29153785, 0., + 0.]) + t = np.trapz(c / dx / len(data), x) + assert_allclose(t, 0.9803093435140049) + + def test_gridcount_2D(self): + N = 20 + # data = np.random.rayleigh(1, size=(2, N)) + data = array([ + [0.38103275, 0.35083136, 0.90024207, 1.88230239, 0.96815399, + 0.57392873, 1.63367908, 1.20944125, 2.03887811, 0.81789145, + 0.69302049, 1.40856592, 0.92156032, 2.14791432, 2.04373821, + 0.69800708, 0.58428735, 1.59128776, 2.05771405, 0.87021964], + [1.44080694, 0.39973751, 1.331243, 2.48895822, 1.18894158, + 1.40526085, 1.01967897, 0.81196474, 1.37978932, 2.03334689, + 0.870329, 1.25106862, 0.5346619, 0.47541236, 1.51930093, + 0.58861519, 1.19780448, 0.81548296, 1.56859488, 1.60653533]]) + + x = np.linspace(0, max(np.ravel(data)) + 1, 5) + dx = x[1] - x[0] + X = np.vstack((x, x)) + c = wk.gridcount(data, X) + assert_allclose(c, + [[0.38922806, 0.8987982, 0.34676493, 0.21042807, 0.], + [1.15012203, 5.16513541, 3.19250588, 0.55420752, 0.], + [0.74293418, 3.42517219, 1.97923195, 0.76076621, 0.], + [0.02063536, 0.31054405, 0.71865964, 0.13486633, 0.], + [0., 0., 0., 0., 0.]], 1e-5) + + t = np.trapz(np.trapz(c / (dx**2 * N), x), x) + assert_allclose(t, 0.9011618785736376) + + def test_gridcount_3D(self): + N = 20 + # data = np.random.rayleigh(1, size=(3, N)) + data = np.array([ + [0.932896, 0.89522635, 0.80636346, 1.32283371, 0.27125435, + 1.91666304, 2.30736635, 1.13662384, 1.73071287, 1.06061127, + 0.99598512, 2.16396591, 1.23458213, 1.12406686, 1.16930431, + 0.73700592, 1.21135139, 0.46671506, 1.3530304, 0.91419104], + [0.62759088, 0.23988169, 2.04909823, 0.93766571, 1.19343762, + 1.94954931, 0.84687514, 0.49284897, 1.05066204, 1.89088505, + 0.840738, 1.02901457, 1.0758625, 1.76357967, 0.45792897, + 1.54488066, 0.17644313, 1.6798871, 0.72583514, 2.22087245], + [1.69496432, 0.81791905, 0.82534709, 0.71642389, 0.89294732, + 
1.66888649, 0.69036947, 0.99961448, 0.30657267, 0.98798713, + 0.83298728, 1.83334948, 1.90144186, 1.25781913, 0.07122458, + 2.42340852, 2.41342037, 0.87233305, 1.17537114, 1.69505988]]) + + x = np.linspace(0, max(np.ravel(data)) + 1, 3) + dx = x[1] - x[0] + X = np.vstack((x, x, x)) + c = wk.gridcount(data, X) + assert_allclose(c, + [[[8.74229894e-01, 1.27910940e+00, 1.42033973e-01], + [1.94778915e+00, 2.59536282e+00, 3.28213680e-01], + [1.08429416e-01, 1.69571495e-01, 7.48896775e-03]], + [[1.44969128e+00, 2.58396370e+00, 2.45459949e-01], + [2.28951650e+00, 4.49653348e+00, 2.73167915e-01], + [1.10905565e-01, 3.18733817e-01, 1.12880816e-02]], + [[7.49265424e-02, 2.18142488e-01, 0.0], + [8.53886762e-02, 3.73415131e-01, 0.0], + [4.16196568e-04, 1.62218824e-02, 0.0]]]) + + t = np.trapz(np.trapz(np.trapz(c / dx**3 / N, x), x), x) + assert_allclose(t, 0.5164999727560187) + + def test_gridcount_4D(self): + + N = 20 + # data = np.random.rayleigh(1, size=(2, N)) + data = array([ + [0.38103275, 0.35083136, 0.90024207, 1.88230239, 0.96815399, + 0.57392873, 1.63367908, 1.20944125, 2.03887811, 0.81789145], + [0.69302049, 1.40856592, 0.92156032, 2.14791432, 2.04373821, + 0.69800708, 0.58428735, 1.59128776, 2.05771405, 0.87021964], + [1.44080694, 0.39973751, 1.331243, 2.48895822, 1.18894158, + 1.40526085, 1.01967897, 0.81196474, 1.37978932, 2.03334689], + [0.870329, 1.25106862, 0.5346619, 0.47541236, 1.51930093, + 0.58861519, 1.19780448, 0.81548296, 1.56859488, 1.60653533]]) + + x = np.linspace(0, max(np.ravel(data)) + 1, 3) + dx = x[1] - x[0] + X = np.vstack((x, x, x, x)) + c = wk.gridcount(data, X) + assert_allclose(c, + [[[[1.77163904e-01, 1.87720108e-01, 0.0], + [5.72573585e-01, 6.09557834e-01, 0.0], + [3.48549923e-03, 4.05931870e-02, 0.0]], + [[1.83770124e-01, 2.56357594e-01, 0.0], + [4.35845892e-01, 6.14958970e-01, 0.0], + [3.07662204e-03, 3.58312786e-02, 0.0]], + [[0.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + [0.0, 0.0, 0.0]]], + [[[3.41883175e-01, 5.97977973e-01, 0.0], + [5.72071865e-01, 8.58566538e-01, 0.0], + [3.46939323e-03, 4.04056116e-02, 0.0]], + [[3.58861043e-01, 6.28962785e-01, 0.0], + [8.80697705e-01, 1.47373158e+00, 0.0], + [2.22868504e-01, 1.18008528e-01, 0.0]], + [[2.91835067e-03, 2.60268355e-02, 0.0], + [3.63686503e-02, 1.07959459e-01, 0.0], + [1.88555613e-02, 7.06358976e-03, 0.0]]], + [[[3.13810608e-03, 2.11731327e-02, 0.0], + [6.71606255e-03, 4.53139824e-02, 0.0], + [0.0, 0.0, 0.0]], + [[7.05946179e-03, 5.44614852e-02, 0.0], + [1.09099593e-01, 1.95935584e-01, 0.0], + [6.61257395e-02, 2.47717418e-02, 0.0]], + [[6.38695629e-04, 5.69610302e-03, 0.0], + [1.00358265e-02, 2.44053065e-02, 0.0], + [5.67244468e-03, 2.12498697e-03, 0.0]]]]) + + t = np.trapz(np.trapz(np.trapz(np.trapz(c / dx**4 / N, x), x), x), x) + assert_allclose(t, 0.21183518274521254) + + +class TestKernels(unittest.TestCase): + def setUp(self): + self.names = ['epanechnikov', 'biweight', 'triweight', 'logistic', + 'p1epanechnikov', 'p1biweight', 'p1triweight', + 'triangular', 'gaussian', 'rectangular', 'laplace'] + + def test_stats(self): + truth = { + 'biweight': (0.14285714285714285, 0.7142857142857143, 22.5), + 'logistic': (3.289868133696453, 1./6, 0.023809523809523808), + 'p1biweight': (0.14285714285714285, 0.7142857142857143, 22.5), + 'triangular': (0.16666666666666666, 0.6666666666666666, inf), + 'gaussian': (1, 0.28209479177387814, 0.21157109383040862), + 'epanechnikov': (0.2, 0.6, inf), + 'triweight': (0.1111111111111111, 0.8158508158508159, inf), + 'p1triweight': (0.1111111111111111, 0.8158508158508159, inf), + 
'p1epanechnikov': (0.2, 0.6, inf), + 'rectangular': (0.3333333333333333, 0.5, inf), + 'laplace': (2, 0.25, inf)} + for name in self.names: + kernel = wk.Kernel(name) + assert_allclose(kernel.stats(), truth[name]) + # truth[name] = kernel.stats() + # print(truth) + + def test_norm_factors_1d(self): + truth = { + 'biweight': 1.0666666666666667, 'logistic': 1.0, + 'p1biweight': 1.0666666666666667, 'triangular': 1.0, + 'gaussian': 2.5066282746310002, 'epanechnikov': 1.3333333333333333, + 'triweight': 0.91428571428571426, 'laplace': 2, + 'p1triweight': 0.91428571428571426, + 'p1epanechnikov': 1.3333333333333333, 'rectangular': 2.0} + for name in self.names: + kernel = wk.Kernel(name) + assert_allclose(kernel.norm_factor(d=1, n=20), truth[name]) + # truth[name] = kernel.norm_factor(d=1, n=20) + + def test_effective_support(self): + truth = {'biweight': (-1.0, 1.0), 'logistic': (-7.0, 7.0), + 'p1biweight': (-1.0, 1.0), 'triangular': (-1.0, 1.0), + 'gaussian': (-4.0, 4.0), 'epanechnikov': (-1.0, 1.0), + 'triweight': (-1.0, 1.0), 'p1triweight': (-1.0, 1.0), + 'p1epanechnikov': (-1.0, 1.0), 'rectangular': (-1.0, 1.0), + 'laplace': (-7.0, 7.0)} + for name in self.names: + kernel = wk.Kernel(name) + assert_allclose(kernel.effective_support(), truth[name]) + # truth[name] = kernel.effective_support() + # print(truth) + # self.assertTrue(False) + + def test_that_kernel_is_a_pdf(self): + + for name in self.names: + kernel = wk.Kernel(name) + xmin, xmax = kernel.effective_support() + x = np.linspace(xmin, xmax, 4*1024+1) + m0 = kernel.norm_factor(d=1, n=1) + pdf = kernel(x)/m0 + # print(name) + # print(pdf[0], pdf[-1]) + # print(np.trapz(pdf, x) - 1) + assert_allclose(np.trapz(pdf, x), 1, 1e-2) + # self.assertTrue(False) + + +class TestSmoothing(unittest.TestCase): + def setUp(self): + self.data = np.array([ + [0.932896, 0.89522635, 0.80636346, 1.32283371, 0.27125435, + 1.91666304, 2.30736635, 1.13662384, 1.73071287, 1.06061127, + 0.99598512, 2.16396591, 1.23458213, 1.12406686, 1.16930431, + 0.73700592, 1.21135139, 0.46671506, 1.3530304, 0.91419104], + [0.62759088, 0.23988169, 2.04909823, 0.93766571, 1.19343762, + 1.94954931, 0.84687514, 0.49284897, 1.05066204, 1.89088505, + 0.840738, 1.02901457, 1.0758625, 1.76357967, 0.45792897, + 1.54488066, 0.17644313, 1.6798871, 0.72583514, 2.22087245], + [1.69496432, 0.81791905, 0.82534709, 0.71642389, 0.89294732, + 1.66888649, 0.69036947, 0.99961448, 0.30657267, 0.98798713, + 0.83298728, 1.83334948, 1.90144186, 1.25781913, 0.07122458, + 2.42340852, 2.41342037, 0.87233305, 1.17537114, 1.69505988]]) + self.gauss = wk.Kernel('gaussian') + + def test_hns(self): + hs = self.gauss.hns(self.data) + assert_allclose(hs, [0.18154437, 0.36207987, 0.37396219]) + + def test_hos(self): + hs = self.gauss.hos(self.data) + assert_allclose(hs, [0.195209, 0.3893332, 0.40210988]) + + def test_hms(self): + hs = self.gauss.hmns(self.data) + assert_allclose(hs, [[3.25196193e-01, -2.68892467e-02, 3.18932448e-04], + [-2.68892467e-02, 3.91283306e-01, 2.38654678e-02], + [3.18932448e-04, 2.38654678e-02, 4.05123874e-01]]) + + def test_hscv(self): + hs = self.gauss.hscv(self.data) + assert_allclose(hs, [0.16858959, 0.32739383, 0.3046287]) + + def test_hstt(self): + hs = self.gauss.hstt(self.data) + assert_allclose(hs, [0.18099075, 0.50409881, 0.11018912]) + + def test_hste(self): + hs = self.gauss.hste(self.data) + assert_allclose(hs, [0.16750009, 0.29059113, 0.17994255]) + + def test_hldpi(self): + hs = self.gauss.hldpi(self.data) + assert_allclose(hs, [0.1732289, 0.33159097, 0.3107633]) + + 
def test_hisj(self):
+        hs = self.gauss.hisj(self.data)
+        assert_allclose(hs, [0.29542502, 0.74277133, 0.51899114])
+
+if __name__ == "__main__":
+    # import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
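
For reviewers who want to exercise the refactored API outside the test runner, the short sketch below mirrors the pattern the new TestKdeTools cases use: build a KDE with a fixed bandwidth, evaluate it both directly and with the fast (binned) evaluator, and integrate the result with np.trapz. It is illustrative only and not part of the patch; the sample is a fresh Rayleigh draw rather than the recorded data in setUp(), so the printed values will not match the hard-coded expectations above, though the integral should still come out close to one.

    # Illustrative sketch (not part of the patch); uses only calls that the
    # tests above exercise: wk.KDE, eval_grid, eval_grid_fast, kde.args.
    import numpy as np
    import wafo.kdetools as wk

    np.random.seed(0)
    data = np.random.rayleigh(1, size=20)          # same kind of sample as setUp()
    x = np.linspace(0, data.max() + 1, 10)

    kde = wk.KDE(data, hs=0.5, alpha=0.0, inc=16)  # fixed bandwidth, regular (non-adaptive) KDE
    f_exact = kde.eval_grid(x)       # direct evaluation on the points in x
    f_fast = kde.eval_grid_fast(x)   # binned approximation; should track f_exact closely

    # Integrating the density over the KDE's own grid (kde.args) should give a
    # value near 1, which is what the np.trapz checks in the tests assert.
    area = np.trapz(kde.eval_grid_fast(), kde.args)
    print(f_exact)
    print(f_fast)
    print(area)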