from __future__ import absolute_import, division import numpy as np from numpy import (pi, linalg, concatenate, sqrt) from scipy.sparse import spdiags from scipy.sparse.linalg import spsolve import scipy.optimize as optimize from scipy.signal import _savitzky_golay from scipy.ndimage import convolve1d from scipy.ndimage.morphology import distance_transform_edt import warnings from wafo.dctpack import dctn, idctn __all__ = ['SavitzkyGolay', 'Kalman', 'HodrickPrescott', 'smoothn', 'HampelFilter', 'SmoothNd'] # noise = np.random.randn(2**8)/10 noise = [-0.0490483773397234, 0.07101522794824691, 0.043129450693516064, 0.07858516767729644, -0.04489848540755172, -0.012710090966021995, 0.022967442347004003, -0.1593564930543959, 0.14752458454255937, -0.1220055819473534, -0.030151822649201642, 0.009880871420067841, 0.0401050562035102, -0.10931262882008379, -0.14550620919429919, -0.06632845063372966, 0.07773893951749064, -0.009527784302072342, 0.06002486046176557, 0.11972670522904964, -0.14436696992162384, 0.06009486605688445, -0.05802790838575894, 0.16964239368289297, 0.09088881573238144, -0.003398259264109856, 0.059830811447018004, -0.08189024981767952, -0.05455483548325317, 0.056651518536760745, -0.05211609539593189, -0.07848323826083178, -0.03921692262168154, -0.04755275276447492, -0.05855172473750038, 0.06480280696345982, -0.05237889271019207, -0.05891912551792037, -0.04045907452295067, -0.09058522124919187, 0.1406515441218336, 0.15557979603588584, -0.09096515320242772, 0.1724190189462715, -0.04978942687488187, -0.0855435866249914, 0.09439718859306868, -0.14758639479507882, -0.07225230856508442, 0.008364508824556314, 0.06704423745152435, -0.01718113731784587, 0.07473943576290255, 0.028133087670974395, 0.026270590730899095, 0.13175770484080895, -0.01821821552644416, 0.11325945472394446, 0.04694754851273185, -0.23899404962137366, -0.1528175431702195, 0.151870532421663, -0.07353204927616248, 0.11604199430172217, -0.09111623325687843, -0.11887366073405607, -0.029872397510562025, 0.047672685028458936, -0.18340065977268627, 0.06896217941210328, 0.042997912112300564, 0.15416998299846174, -0.0386283794526545, 0.14070600624229804, 0.020984623041646142, -0.1892741373898864, 0.03253519397457513, -0.06182705494266229, -0.1326495728975159, 0.026234150321195537, 0.0550541170409239, 0.029275813927566702, 0.042742104678489906, -0.2170004668366198, -0.00035991761313413197, -0.0638872684868346, -0.11769436550364845, -0.017792813824766808, -0.022786402363044914, -0.10668279890162544, 0.05979507681729831, -0.1008100479486818, 0.0703474638610785, 0.1630534776572414, 0.06682406484481357, -0.0527228810042394, -0.046515310355062636, 0.04609515154732255, 0.11503753838360875, 0.11517599661346192, -0.05596425736274815, -0.06149119758833357, 0.10599964719188917, -0.012076380140185552, 0.0436828262270732, -0.03910174791470852, -0.03263251315745414, -0.012513843545007558, 0.004590611827089213, 0.0762719171282112, 0.06497715695411535, -0.003280826953794463, 0.13524154885565484, -0.020441364843140027, -0.09488214173137496, 0.1385755359902911, -0.23883052310744746, -0.10110537386421652, -0.1588981058869149, 0.06645444828058467, -0.2103306051703948, 0.15215327561190056, -0.03582175680076989, 0.013593833383013293, -0.11542058494732854, -0.05613268116816099, 0.012711037661355899, 0.04242805633100794, -0.011799315325220794, 0.12141794601099387, 0.054285270560662645, 0.07549385527022169, -0.04549437694653443, 0.11009856942530691, 0.05233482224379645, -0.042246830306136955, -0.1737197924666796, -0.10589427330127077, 0.04895472597843757, 0.06756519832636187, 0.083376600742245, -0.07502859751328732, -0.09493802498812245, -0.01058967186080922, -0.23759763247649018, 0.08439637862616411, -0.2021754550870607, 0.07365816800912013, 0.07435401663661081, 0.047992791325423556, -0.005250092450514997, 0.1693610927865244, 0.030338113772413154, -0.18010537945928004, 0.01744129379023785, 0.1902505975745975, -0.004598733688659104, 0.13663542585715657, -0.04100719174496187, -0.15406303185009937, -0.05297118247908407, 0.04435144348234146, 0.022377061632995063, 0.05491057192661079, -0.08473062163887303, -0.03907641665824873, 0.008686833182075315, -0.06053451866471732, -0.051735892949367854, -0.1902071038920444, 0.11508817132666356, 0.08903045262390544, -0.028537865059606825, -0.07160660523436188, 0.05994760363400714, 0.03637820115278829, 0.027604828657436364, 0.04168122074675033, -0.021707671111253164, 0.06770739385070886, -0.04848505599153394, -0.14377853380839264, 0.17448368721141166, -0.05972663746675887, -0.1615729579782888, -0.09508063624538736, -0.05501964872264433, -0.14370852991216054, -0.1025241548369181, -0.14751000180775747, -0.05402976681470177, -0.05847606145915367, 0.015603559358987138, 0.040327317968149784, 0.015596571983936361, 0.08721780106901023, 0.13669912032986667, -0.07070030973798198, 0.04821782065785363, 0.05266507025196321, -0.013775127999269254, 0.07032239356769251, 0.04685048562398681, 0.004648720572365418, -0.19364418622487742, 0.013662994215276983, 0.04703494294810789, 0.04863794676207257, -0.09883919097676001, -0.004798538894290822, -0.22183503742087135, 0.062096556899520906, 0.07098373434409047, -0.05335639719762188, -0.09150459514627822, -0.1329311651202703, -0.037376442133682145, 0.1238732233009325, -0.01232052797514208, 0.007151238520555889, -0.04772828461473576, -0.029830395387364726, -0.03277336781995001, 0.09964048194066656, 0.09306408040020697, -0.03761782769337173, 0.07059549032551317, -0.15490333414875848, 0.12599077783991805, 0.23520519946427365, 0.021640305946603107, 0.014851729969403227, -0.039035437601777224, -0.12087588583684257, -0.07207855860199022, -0.002800081649022032, 0.2543907308881692, -0.07966223382328289, -0.1014419766425384, -0.11243061225437859, -0.08744845956375621, -0.05540140267769189, -0.04995531421885231, -0.13274847220288336, 0.06435474943034288, 0.015640361472736924, -0.11210644205346465, -0.04080648821849449, -0.011452694652695428, 0.22044736923317904, 0.024322228245949113, 0.09622705616884256, 0.05793212184654495, -0.10620553812614748, 0.06762504431789758, 0.19135075519983785] # @IgnorePep8 def _assert(cond, msg): if not cond: raise ValueError(msg) class SavitzkyGolay(object): r"""Smooth and optionally differentiate data with a Savitzky-Golay filter. The Savitzky-Golay filter removes high frequency noise from data. It has the advantage of preserving the original shape and features of the signal better than other types of filtering approaches, such as moving averages techniques. Parameters ---------- n : int the size of the smoothing window is 2*n+1. degree : int the degree of the polynomial used in the filtering. Must be less than `window_size` - 1, i.e, less than 2*n. diff_order : int order of the derivative to compute (default = 0 means only smoothing) 0 means that filter results in smoothing of function 1 means that filter results in smoothing the first derivative of the function and so on ... delta : float, optional The spacing of the samples to which the filter will be applied. This is only used if deriv > 0. Default is 1.0. axis : int, optional The axis of the array `x` along which the filter is to be applied. Default is -1. mode : str, optional Must be 'mirror', 'constant', 'nearest', 'wrap' or 'interp'. This determines the type of extension to use for the padded signal to which the filter is applied. When `mode` is 'constant', the padding value is given by `cval`. See the Notes for more details on 'mirror', 'constant', 'wrap', and 'nearest'. When the 'interp' mode is selected (the default), no extension is used. Instead, a degree `polyorder` polynomial is fit to the last `window_length` values of the edges, and this polynomial is used to evaluate the last `window_length // 2` output values. cval : scalar, optional Value to fill past the edges of the input if `mode` is 'constant'. Default is 0.0. Notes ----- The Savitzky-Golay is a type of low-pass filter, particularly suited for smoothing noisy data. The main idea behind this approach is to make for each point a least-square fit with a polynomial of high order over a odd-sized window centered at the point. Details on the `mode` options: 'mirror': Repeats the values at the edges in reverse order. The value closest to the edge is not included. 'nearest': The extension contains the nearest input value. 'constant': The extension contains the value given by the `cval` argument. 'wrap': The extension contains the values from the other end of the array. For example, if the input is [1, 2, 3, 4, 5, 6, 7, 8], and `window_length` is 7, the following shows the extended data for the various `mode` options (assuming `cval` is 0):: mode | Ext | Input | Ext -----------+---------+------------------------+--------- 'mirror' | 4 3 2 | 1 2 3 4 5 6 7 8 | 7 6 5 'nearest' | 1 1 1 | 1 2 3 4 5 6 7 8 | 8 8 8 'constant' | 0 0 0 | 1 2 3 4 5 6 7 8 | 0 0 0 'wrap' | 6 7 8 | 1 2 3 4 5 6 7 8 | 1 2 3 Examples -------- >>> import wafo.sg_filter as ws >>> t = np.linspace(-4, 4, 500) >>> noise = np.random.normal(0, 0.05, t.shape) >>> noise = np.sqrt(0.05)*np.sin(100*t) >>> y = np.exp( -t**2 ) + noise >>> ysg = SavitzkyGolay(n=20, degree=2).smooth(y) >>> np.allclose(ysg[:3], [ 0.01345312, 0.01164172, 0.00992839]) True >>> ysm = SavitzkyGolay(n=20, degree=2, mode='mirror').smooth(y) >>> np.allclose(ysm[:3], [-0.01604804, -0.00592883, 0.0035858 ]) True >>> ysc = SavitzkyGolay(n=20, degree=2, mode='constant').smooth(y) >>> np.allclose(ysc[:3], [-0.00279797, 0.00519541, 0.00666146]) True >>> ysn = SavitzkyGolay(n=20, degree=2, mode='nearest').smooth(y) >>> np.allclose(ysn[:3], [ 0.08711171, 0.0846945 , 0.07587448]) True >>> ysw = SavitzkyGolay(n=20, degree=2, mode='wrap').smooth(y) >>> np.allclose(ysw[:3], [-0.00208422, -0.00201491, 0.00201772]) True >>> np.allclose(SavitzkyGolay(n=20, degree=2).smooth_last(y), ... 0.004921382626100505) True import matplotlib.pyplot as plt h = plt.plot(t, y, label='Noisy signal') h1 = plt.plot(t, np.exp(-t**2), 'k', lw=1.5, label='Original signal') h2 = plt.plot(t, ysg, 'r', label='Filtered signal') h3 = plt.legend() h4 = plt.title('Savitzky-Golay') plt.show() References ---------- .. [1] A. Savitzky, M. J. E. Golay, Smoothing and Differentiation of Data by Simplified Least Squares Procedures. Analytical Chemistry, 1964, 36 (8), pp 1627-1639. .. [2] Numerical Recipes 3rd Edition: The Art of Scientific Computing W.H. Press, S.A. Teukolsky, W.T. Vetterling, B.P. Flannery Cambridge University Press ISBN-13: 9780521880688 """ def __init__(self, n, degree=1, diff_order=0, delta=1.0, axis=-1, mode='interp', cval=0.0): self.n = n self.degree = degree self.diff_order = diff_order self.mode = mode self.cval = cval self.axis = axis self.delta = delta window_length = 2 * n + 1 self._coeff = _savitzky_golay.savgol_coeffs(window_length, degree, deriv=diff_order, delta=delta) def smooth_last(self, signal, k=1): coeff = self._coeff n = (np.size(coeff) - 1) // 2 y = np.squeeze(signal) if n == 0: return y if y.ndim > 1: coeff.shape = (-1, 1) first_vals = y[0] - np.abs(y[n:0:-1] - y[0]) last_vals = y[-1] + np.abs(y[-2:-n - 2:-1] - y[-1]) y = concatenate((first_vals, y, last_vals)) return (y[-2 * n - 1 - k:-k] * coeff).sum(axis=0) def __call__(self, signal): return self.smooth(signal) def smooth(self, signal): dtype = np.result_type(signal, np.float) x = np.asarray(signal, dtype=dtype) coeffs = self._coeff mode, axis = self.mode, self.axis if mode == "interp": window_length, polyorder = self.n * 2 + 1, self.degree deriv, delta = self.diff_order, self.delta y = convolve1d(x, coeffs, axis=axis, mode="constant") _savitzky_golay._fit_edges_polyfit(x, window_length, polyorder, deriv, delta, axis, y) else: y = convolve1d(x, coeffs, axis=axis, mode=mode, cval=self.cval) return y def evar(y): """Noise variance estimation. Assuming that the deterministic function Y has additive Gaussian noise, EVAR(Y) returns an estimated variance of this noise. Note: ---- A thin-plate smoothing spline model is used to smooth Y. It is assumed that the model whose generalized cross-validation score is minimum can provide the variance of the additive noise. A few tests showed that EVAR works very well with "not too irregular" functions. Examples: -------- 1D signal >>> n = 1e6 >>> x = np.linspace(0,100,n); >>> y = np.cos(x/10)+(x/50) >>> var0 = 0.02 # noise variance >>> yn = y + sqrt(var0)*np.random.randn(*y.shape) >>> s = evar(yn) # estimated variance >>> np.abs(s-var0)/var0 < 3.5/np.sqrt(n) True 2D function >>> xp = np.linspace(0,1,50) >>> x, y = np.meshgrid(xp,xp) >>> f = np.exp(x+y) + np.sin((x-2*y)*3) >>> var0 = 0.04 # noise variance >>> fn = f + sqrt(var0)*np.random.randn(*f.shape) >>> s = evar(fn) # estimated variance >>> np.abs(s-var0)/var0 < 3.5/np.sqrt(50) True 3D function >>> yp = np.linspace(-2,2,50) >>> [x,y,z] = np.meshgrid(yp, yp, yp, sparse=True) >>> f = x*np.exp(-x**2-y**2-z**2) >>> var0 = 0.5 # noise variance >>> fn = f + np.sqrt(var0)*np.random.randn(*f.shape) >>> s = evar(fn) # estimated variance >>> np.abs(s-var0)/var0 < 3.5/np.sqrt(50) True Other example ------------- http://www.biomecardio.com/matlab/evar.html Note: ---- EVAR is only adapted to evenly-gridded 1-D to N-D data. See also -------- VAR, STD, SMOOTHN """ # Damien Garcia -- 2008/04, revised 2009/10 y = np.atleast_1d(y) d = y.ndim sh0 = y.shape S = np.zeros(sh0) sh1 = np.ones((d,)) cos = np.cos pi = np.pi for i in range(d): ni = sh0[i] sh1[i] = ni t = np.arange(ni).reshape(sh1) / ni S += cos(pi * t) sh1[i] = 1 S2 = 2 * (d - S).ravel() # N-D Discrete Cosine Transform of Y dcty2 = dctn(y).ravel() ** 2 def score_fun(L, S2, dcty2): # Generalized cross validation score M = 1 - 1. / (1 + 10 ** L * S2) noisevar = (dcty2 * M ** 2).mean() return noisevar / M.mean() ** 2 # fun = lambda x : score_fun(x, S2, dcty2) Lopt = optimize.fminbound(score_fun, -38, 38, args=(S2, dcty2)) M = 1.0 - 1.0 / (1 + 10 ** Lopt * S2) noisevar = (dcty2 * M ** 2).mean() return noisevar class _Filter(object): def __init__(self, y, z0, weightstr, weights, s, robust, maxiter, tolz): self.y = y self.z0 = z0 self.weightstr = weightstr self.s = s self.robust = robust self.maxiter = maxiter self.tolz = tolz self.auto_smooth = s is None self.is_finite = np.isfinite(y) self.nof = self.is_finite.sum() # number of finite elements self.W = self._normalized_weights(weights, self.is_finite) self.gamma = self._gamma_fun(y) self.N = self._tensor_rank(y) self.s_min, self.s_max = self._smoothness_limits(self.N) # Initialize before iterating self.Wtot = self.W self.is_weighted = (self.W < 1).any() # Weighted or missing data? self.z0 = self._get_start_condition(y, z0) self.y[~self.is_finite] = 0 # arbitrary values for missing y-data # Error on p. Smoothness parameter s = 10^p self.errp = 0.1 # Relaxation factor RF: to speedup convergence self.RF = 1.75 if self.is_weighted else 1.0 @staticmethod def _tensor_rank(y): """tensor rank of the y-array""" return (np.array(y.shape) != 1).sum() @staticmethod def _smoothness_limits(n): """ Return upper and lower bound for the smoothness parameter The average leverage (h) is by definition in [0 1]. Weak smoothing occurs if h is close to 1, while over-smoothing appears when h is near 0. Upper and lower bounds for h are given to avoid under- or over-smoothing. See equation relating h to the smoothness parameter (Equation #12 in the referenced CSDA paper). """ h_min = 1e-6 ** (2. / n) h_max = 0.99 ** (2. / n) s_min = (((1 + sqrt(1 + 8 * h_max)) / 4. / h_max) ** 2 - 1) / 16 s_max = (((1 + sqrt(1 + 8 * h_min)) / 4. / h_min) ** 2 - 1) / 16 return s_min, s_max @staticmethod def _lambda_tensor(y): """ Return the Lambda tensor Lambda contains the eigenvalues of the difference matrix used in this penalized least squares process. """ d = y.ndim Lambda = np.zeros(y.shape) shape0 = [1, ] * d for i in range(d): shape0[i] = y.shape[i] Lambda = Lambda + \ np.cos(pi * np.arange(y.shape[i]) / y.shape[i]).reshape(shape0) shape0[i] = 1 Lambda = -2 * (d - Lambda) return Lambda def _gamma_fun(self, y): Lambda = self._lambda_tensor(y) def gamma(s): return 1. / (1 + s * Lambda ** 2) return gamma @staticmethod def _initial_guess(y, I): # Initial Guess with weighted/missing data # nearest neighbor interpolation (in case of missing values) z = y if (1 - I).any(): notI = ~I z, L = distance_transform_edt(notI, return_indices=True) z[notI] = y[L.flat[notI]] # coarse fast smoothing using one-tenth of the DCT coefficients shape = z.shape d = z.ndim z = dctn(z) for k in range(d): z[int((shape[k] + 0.5) / 10) + 1::, ...] = 0 z = z.reshape(np.roll(shape, -k)) z = z.transpose(np.roll(range(d), -1)) # z = shiftdim(z,1); return idctn(z) def _get_start_condition(self, y, z0): # Initial conditions for z if self.is_weighted: # With weighted/missing data # An initial guess is provided to ensure faster convergence. For # that purpose, a nearest neighbor interpolation followed by a # coarse smoothing are performed. if z0 is None: z = self._initial_guess(y, self.is_finite) else: z = z0 # an initial guess (z0) has been provided else: z = np.zeros(y.shape) return z @staticmethod def _normalized_weights(weight, is_finite): """ Return normalized weights. Zero weights are assigned to not finite values (Inf or NaN), (Inf/NaN values = missing data). """ weights = weight * is_finite _assert(np.all(0 <= weights), 'Weights must all be >=0') return weights / weights.max() @staticmethod def _studentized_residuals(r, I, h): median_abs_deviation = np.median(np.abs(r[I] - np.median(r[I]))) return np.abs(r / (1.4826 * median_abs_deviation) / sqrt(1 - h)) def robust_weights(self, r, I, h): """Return weights for robust smoothing.""" def bisquare(u): c = 4.685 return (1 - (u / c) ** 2) ** 2 * ((u / c) < 1) def talworth(u): c = 2.795 return u < c def cauchy(u): c = 2.385 return 1. / (1 + (u / c) ** 2) u = self._studentized_residuals(r, I, h) wfun = {'cauchy': cauchy, 'talworth': talworth}.get(self.weightstr, bisquare) weights = wfun(u) weights[np.isnan(weights)] = 0 return weights @staticmethod def _average_leverage(s, N): h = sqrt(1 + 16 * s) h = sqrt(1 + h) / sqrt(2) / h return h ** N def check_smooth_parameter(self, s): if self.auto_smooth: if np.abs(np.log10(s) - np.log10(self.s_min)) < self.errp: warnings.warn("""s = %g: the lower bound for s has been reached. Put s as an input variable if required.""" % s) elif np.abs(np.log10(s) - np.log10(self.s_max)) < self.errp: warnings.warn("""s = %g: the Upper bound for s has been reached. Put s as an input variable if required.""" % s) def gcv(self, p, aow, DCTy, y, Wtot): # Search the smoothing parameter s that minimizes the GCV score s = 10.0 ** p Gamma = self.gamma(s) if aow > 0.9: # aow = 1 means that all of the data are equally weighted # very much faster: does not require any inverse DCT residual = DCTy.ravel() * (Gamma.ravel() - 1) else: # take account of the weights to calculate RSS: is_finite = self.is_finite yhat = idctn(Gamma * DCTy) residual = sqrt(Wtot[is_finite]) * (y[is_finite] - yhat[is_finite]) TrH = Gamma.sum() RSS = linalg.norm(residual)**2 # Residual sum-of-squares GCVscore = RSS / self.nof / (1.0 - TrH / y.size) ** 2 return GCVscore def __call__(self, z, s): auto_smooth = self.auto_smooth norm = linalg.norm y = self.y Wtot = self.Wtot Gamma = 1 if s is not None: Gamma = self.gamma(s) # "amount" of weights (see the function GCVscore) aow = Wtot.sum() / y.size # 0 < aow <= 1 for nit in range(self.maxiter): DCTy = dctn(Wtot * (y - z) + z) if auto_smooth and not np.remainder(np.log2(nit + 1), 1): # The generalized cross-validation (GCV) method is used. # We seek the smoothing parameter s that minimizes the GCV # score i.e. s = Argmin(GCVscore). # Because this process is time-consuming, it is performed from # time to time (when nit is a power of 2) log10s = optimize.fminbound( self.gcv, np.log10(self.s_min), np.log10(self.s_max), args=(aow, DCTy, y, Wtot), xtol=self.errp, full_output=False, disp=False) s = 10 ** log10s Gamma = self.gamma(s) z0 = z z = self.RF * idctn(Gamma * DCTy) + (1 - self.RF) * z # if no weighted/missing data => tol=0 (no iteration) tol = norm(z0.ravel() - z.ravel()) / norm(z.ravel()) converged = tol <= self.tolz or not self.is_weighted if converged: break if self.robust: # -- Robust Smoothing: iteratively re-weighted process h = self._average_leverage(s, self.N) self.Wtot = self.W * self.robust_weights(y - z, self.is_finite, h) # re-initialize for another iterative weighted process self.is_weighted = True return z, s, converged class SmoothNd(object): def __init__(self, s=None, weight=None, robust=False, z0=None, tolz=1e-3, maxiter=100, fulloutput=False): self.s = s self.weight = weight self.robust = robust self.z0 = z0 self.tolz = tolz self.maxiter = maxiter self.fulloutput = fulloutput @property def weightstr(self): if isinstance(self._weight, str): return self._weight.lower() return 'bisquare' @property def weight(self): if self._weight is None or isinstance(self._weight, str): return 1.0 return self._weight @weight.setter def weight(self, weight): self._weight = weight def _init_filter(self, y): return _Filter(y, self.z0, self.weightstr, self.weight, self.s, self.robust, self.maxiter, self.tolz) @property def num_steps(self): return 3 if self.robust else 1 def __call__(self, data): y = np.atleast_1d(data) if y.size < 2: return data _filter = self._init_filter(y) z = _filter.z0 s = _filter.s converged = False for _i in range(self.num_steps): z, s, converged = _filter(z, s) if not converged: msg = """Maximum number of iterations (%d) has been exceeded. Increase MaxIter option or decrease TolZ value.""" % (self.maxiter) warnings.warn(msg) _filter.check_smooth_parameter(s) if self.fulloutput: return z, s return z def smoothn(data, s=None, weight=None, robust=False, z0=None, tolz=1e-3, maxiter=100, fulloutput=False): """ SMOOTHN fast and robust spline smoothing for 1-D to N-D data. Parameters ---------- data : array like uniformly-sampled data array to smooth. Non finite values (NaN or Inf) are treated as missing values. s : real positive scalar smooting parameter. The larger S is, the smoother the output will be. Default value is automatically determined using the generalized cross-validation (GCV) method. weight : string or array weights weighting array of real positive values, that must have the same size as DATA. Note that a zero weight corresponds to a missing value. robust : bool If true carry out a robust smoothing that minimizes the influence of outlying data. tolz : real positive scalar Termination tolerance on Z (default = 1e-3) maxiter : scalar integer Maximum number of iterations allowed (default = 100) z0 : array-like Initial value for the iterative process (default = original data) Returns ------- z : array like smoothed data To be made ---------- Estimate the confidence bands (see Wahba 1983, Nychka 1988). Reference --------- Garcia D, Robust smoothing of gridded data in one and higher dimensions with missing values. Computational Statistics & Data Analysis, 2010. http://www.biomecardio.com/pageshtm/publi/csda10.pdf Examples: -------- 1-D example >>> import matplotlib.pyplot as plt >>> import wafo.sg_filter as ws >>> x = np.linspace(0, 100, 2**8) >>> noise = np.random.randn(2**8)/10 >>> noise = ws.noise >>> y = np.cos(x/10)+(x/50)**2 + noise >>> y[np.r_[70, 75, 80]] = np.array([5.5, 5, 6]) >>> y[181] = np.nan >>> z = ws.smoothn(y) # Regular smoothing >>> np.allclose(z[:3], [ 0.99517904, 0.99372346, 0.99079798]) True >>> zr = ws.smoothn(y,robust=True) # Robust smoothing >>> np.allclose(zr[:3], [ 1.01190564, 1.00976197, 1.00513244]) True h=plt.subplot(121), h = plt.plot(x,y,'r.',x,z,'k',linewidth=2) h=plt.title('Regular smoothing') h=plt.subplot(122) h=plt.plot(x,y,'r.',x,zr,'k',linewidth=2) h=plt.title('Robust smoothing') 2-D example >>> xp = np.r_[0:1:.02] >>> [x,y] = np.meshgrid(xp,xp) >>> f = np.exp(x+y) + np.sin((x-2*y)*3) >>> fn = f + np.random.randn(*f.shape)*0.5 >>> fs = smoothn(fn) h=plt.subplot(121), h=plt.contourf(xp, xp, fn) h=plt.subplot(122) h=plt.contourf(xp, xp, fs) 2-D example with missing data >>> import wafo.demos as wd >>> n = 256 >>> x0, y0, z0 = wd.peaks(n) >>> z = z0 + rand(size(y0))*2 I = randperm(n**2) z[I(1:n^2*0.5)] = np.NaN; # lose 1/2 of data z[40:90, 140:190] = np.NaN; # create a hole zs = smoothn(z) plt.subplot(2,2,1) plt.imagesc(y) # , axis equal off plt.title('Noisy corrupt data') plt.subplot(223) plt.imagesc(z) # , axis equal off plt.title('Recovered data ...') plt.subplot(224) plt.imagesc(y0) # , axis equal off plt.title('... compared with original data') 3-D example [x,y,z] = meshgrid(-2:.2:2); xslice = [-0.8,1]; yslice = 2; zslice = [-2,0]; vn = x.*exp(-x.^2-y.^2-z.^2) + randn(size(x))*0.06; subplot(121), slice(x,y,z,vn,xslice,yslice,zslice,'cubic') title('Noisy data') v = smoothn(vn); subplot(122), slice(x,y,z,v,xslice,yslice,zslice,'cubic') title('Smoothed data') Cellular vortical flow [x,y] = meshgrid(linspace(0,1,24)); Vx = cos(2*pi*x+pi/2).*cos(2*pi*y); Vy = sin(2*pi*x+pi/2).*sin(2*pi*y); Vx = Vx + sqrt(0.05)*randn(24,24); adding Gaussian noise Vy = Vy + sqrt(0.05)*randn(24,24); adding Gaussian noise I = randperm(numel(Vx)); Vx(I(1:30)) = (rand(30,1)-0.5)*5; adding outliers Vy(I(1:30)) = (rand(30,1)-0.5)*5; adding outliers Vx(I(31:60)) = NaN; missing values Vy(I(31:60)) = NaN; missing values Vs = smoothn(complex(Vx,Vy),'robust'); automatic smoothing subplot(121), quiver(x,y,Vx,Vy,2.5), axis square title('Noisy velocity field') subplot(122), quiver(x,y,real(Vs),imag(Vs)), axis square title('Smoothed velocity field') See also ------- SmoothNd -- Damien Garcia -- 2009/03, revised 2010/11 Visit http://www.biomecardio.com/matlab/smoothn.html for more details about SMOOTHN """ return SmoothNd(s, weight, robust, z0, tolz, maxiter, fulloutput)(data) class HodrickPrescott(object): """Smooth data with a Hodrick-Prescott filter. The Hodrick-Prescott filter removes high frequency noise from data. It has the advantage of preserving the original shape and features of the signal better than other types of filtering approaches, such as moving averages techniques. Parameter --------- w : real scalar smooting parameter. Larger w means more smoothing. Values usually in the [100, 20000] interval. As w approach infinity H-P will approach a line. Examples -------- >>> import wafo.sg_filter as ws >>> t = np.linspace(-4, 4, 500) >>> y = np.exp( -t**2 ) + np.random.normal(0, 0.05, t.shape) >>> ysg = ws.HodrickPrescott(w=10000)(y) import matplotlib.pyplot as plt h = plt.plot(t, y, label='Noisy signal') h1 = plt.plot(t, np.exp(-t**2), 'k', lw=1.5, label='Original signal') h2 = plt.plot(t, ysg, 'r', label='Filtered signal') h3 = plt.legend() h4 = plt.title('Hodrick-Prescott') plt.show() References ---------- .. [1] E. T. Whittaker, On a new method of graduation. In proceedings of the Edinburgh Mathematical association., 1923, 78, pp 88-89. .. [2] R. Hodrick and E. Prescott, Postwar U.S. business cycles: an empirical investigation, Journal of money, credit and banking, 1997, 29 (1), pp 1-16. .. [3] Kim Hyeongwoo, Hodrick-Prescott filter, 2004, www.auburn.edu/~hzk0001/hpfilter.pdf """ def __init__(self, w=100): self.w = w def _get_matrix(self, n): w = self.w diag_matrix = np.repeat( np.atleast_2d([w, -4 * w, 6 * w + 1, -4 * w, w]).T, n, axis=1) A = spdiags(diag_matrix, np.arange(-2, 2 + 1), n, n).tocsr() A[0, 0] = A[-1, -1] = 1 + w A[1, 1] = A[-2, -2] = 1 + 5 * w A[0, 1] = A[1, 0] = A[-2, -1] = A[-1, -2] = -2 * w return A def __call__(self, x): x = np.atleast_1d(x).flatten() n = len(x) if n < 4: return x.copy() A = self._get_matrix(n) return spsolve(A, x) class Kalman(object): """ Kalman filter object - updates a system state vector estimate based upon an observation, using a discrete Kalman filter. The Kalman filter is "optimal" under a variety of circumstances. An excellent paper on Kalman filtering at the introductory level, without detailing the mathematical underpinnings, is: "An Introduction to the Kalman Filter" Greg Welch and Gary Bishop, University of North Carolina http://www.cs.unc.edu/~welch/kalman/kalmanIntro.html PURPOSE: The purpose of each iteration of a Kalman filter is to update the estimate of the state vector of a system (and the covariance of that vector) based upon the information in a new observation. The version of the Kalman filter in this function assumes that observations occur at fixed discrete time intervals. Also, this function assumes a linear system, meaning that the time evolution of the state vector can be calculated by means of a state transition matrix. USAGE: filt = Kalman(R, x, P, A, B=0, Q, H) x = filt(z, u=0) filt is a "system" object containing various fields used as input and output. The state estimate "x" and its covariance "P" are updated by the function. The other fields describe the mechanics of the system and are left unchanged. A calling routine may change these other fields as needed if state dynamics are time-dependent; otherwise, they should be left alone after initial values are set. The exceptions are the observation vector "z" and the input control (or forcing function) "u." If there is an input function, then "u" should be set to some nonzero value by the calling routine. System dynamics --------------- The system evolves according to the following difference equations, where quantities are further defined below: x = Ax + Bu + w meaning the state vector x evolves during one time step by premultiplying by the "state transition matrix" A. There is optionally (if nonzero) an input vector u which affects the state linearly, and this linear effect on the state is represented by premultiplying by the "input matrix" B. There is also gaussian process noise w. z = Hx + v meaning the observation vector z is a linear function of the state vector, and this linear relationship is represented by premultiplication by "observation matrix" H. There is also gaussian measurement noise v. where w ~ N(0,Q) meaning w is gaussian noise with covariance Q v ~ N(0,R) meaning v is gaussian noise with covariance R VECTOR VARIABLES: s.x = state vector estimate. In the input struct, this is the "a priori" state estimate (prior to the addition of the information from the new observation). In the output struct, this is the "a posteriori" state estimate (after the new measurement information is included). z = observation vector u = input control vector, optional (defaults to zero). MATRIX VARIABLES: s.A = state transition matrix (defaults to identity). s.P = covariance of the state vector estimate. In the input struct, this is "a priori," and in the output it is "a posteriori." (required unless autoinitializing as described below). s.B = input matrix, optional (defaults to zero). s.Q = process noise covariance (defaults to zero). s.R = measurement noise covariance (required). s.H = observation matrix (defaults to identity). NORMAL OPERATION: (1) define all state definition fields: A,B,H,Q,R (2) define intial state estimate: x,P (3) obtain observation and control vectors: z,u (4) call the filter to obtain updated state estimate: x,P (5) return to step (3) and repeat INITIALIZATION: If an initial state estimate is unavailable, it can be obtained from the first observation as follows, provided that there are the same number of observable variables as state variables. This "auto- intitialization" is done automatically if s.x is absent or NaN. x = inv(H)*z P = inv(H)*R*inv(H') This is mathematically equivalent to setting the initial state estimate covariance to infinity. Example (Automobile Voltimeter): ------- >>> import wafo.sg_filter as ws # Define the system as a constant of 12 volts: >>> V0 = 12 >>> h = 1 # voltimeter measure the voltage itself >>> q = 1e-5 # variance of process noise s the car operates >>> r = 0.1**2 # variance of measurement error >>> b = 0 # no system input >>> u = 0 # no system input >>> filt = ws.Kalman(R=r, A=1, Q=q, H=h, B=b) # Generate random voltages and watch the filter operate. >>> n = 50 >>> truth = np.random.randn(n)*np.sqrt(q) + V0 >>> z = truth + np.random.randn(n)*np.sqrt(r) # measurement >>> x = np.zeros(n) >>> for i, zi in enumerate(z): ... x[i] = filt(zi, u) # perform a Kalman filter iteration import matplotlib.pyplot as plt hz = plt.plot(z,'r.', label='observations') # a-posteriori state estimates: hx = plt.plot(x,'b-', label='Kalman output') ht = plt.plot(truth,'g-', label='true voltage') h = plt.legend() h1 = plt.title('Automobile Voltimeter Example') plt.show() """ def __init__(self, R, x=None, P=None, A=None, B=0, Q=None, H=None): self.R = R # Estimated error in measurements. self.x = x # Initial state estimate. self.P = P # Initial covariance estimate. self.A = A # State transition matrix. self.B = B # Control matrix. self.Q = Q # Estimated error in process. self.H = H # Observation matrix. self.reset() def reset(self): self._filter = self._filter_first def _set_A(self, n): if self.A is None: self.A = np.eye(n) self.A = np.atleast_2d(self.A) def _set_Q(self, n): if self.Q is None: self.Q = np.zeros((n, n)) self.Q = np.atleast_2d(self.Q) def _set_H(self, n): if self.H is None: self.H = np.eye(n) self.H = np.atleast_2d(self.H) def _set_P(self, HI): if self.P is None: self.P = np.dot(np.dot(HI, self.R), HI.T) self.P = np.atleast_2d(self.P) def _init_first(self, n): self._set_A(n) self._set_Q(n) self._set_H(n) try: HI = np.linalg.inv(self.H) except: HI = np.eye(n) self._set_P(HI) return HI def _first_state(self, z): n = np.size(z) HI = self._init_first(n) # initialize state estimate from first observation x = np.dot(HI, z) return x def _filter_first(self, z, u): self._filter = self._filter_main if self.x is None: self.x = self._first_state(z) return self.x n = np.size(self.x) self._init_first(n) return self._filter_main(z, u) def _predict_state(self, x, u): return np.dot(self.A, x) + np.dot(self.B, u) def _predict_covariance(self, P): A = self.A return np.dot(np.dot(A, P), A.T) + self.Q def _compute_gain(self, P): """Kalman gain factor.""" H = self.H PHT = np.dot(P, H.T) innovation_covariance = np.dot(H, PHT) + self.R # return np.linalg.solve(PHT, innovation_covariance) return np.dot(PHT, np.linalg.inv(innovation_covariance)) def _update_state_from_observation(self, x, z, K): innovation = z - np.dot(self.H, x) return x + np.dot(K, innovation) def _update_covariance(self, P, K): return P - np.dot(K, np.dot(self.H, P)) # return np.dot(np.eye(len(P)) - K * self.H, P) def _filter_main(self, z, u): """ This is the code which implements the discrete Kalman filter: """ P = self._predict_covariance(self.P) x = self._predict_state(self.x, u) K = self._compute_gain(P) self.P = self._update_covariance(P, K) self.x = self._update_state_from_observation(x, z, K) return self.x def __call__(self, z, u=0): return self._filter(z, u) class HampelFilter(object): """Hampel Filter. HAMPEL(X,Y,DX,T,varargin) returns the Hampel filtered values of the elements in Y. It was developed to detect outliers in a time series, but it can also be used as an alternative to the standard median filter. X,Y are row or column vectors with an equal number of elements. The elements in Y should be Gaussian distributed. Parameters ---------- dx : positive scalar (default 3 * median(diff(X)) which defines the half width of the filter window. Dx should be dimensionally equivalent to the values in X. t : positive scalar (default 3) which defines the threshold value used in the equation |Y - Y0| > T * S0. adaptive: real scalar if greater than 0 it uses an experimental adaptive Hampel filter. If none it uses a standard Hampel filter fulloutput: bool if True also the vectors: outliers, Y0,LB,UB,ADX, which corresponds to the mask of the replaced values, nominal data, lower and upper bounds on the Hampel filter and the relative half size of the local window, respectively. outliers.sum() gives the number of outliers detected. Examples --------- Hampel filter removal of outliers >>> import numpy as np >>> randint = np.random.randint >>> Y = 5000 + np.random.randn(1000) >>> outliers = randint(0,1000, size=(10,)) >>> Y[outliers] = Y[outliers] + randint(1000, size=(10,)) >>> YY, res = HampelFilter(fulloutput=True)(Y) >>> YY1, res1 = HampelFilter(dx=1, t=3, adaptive=0.1, fulloutput=True)(Y) >>> YY2, res2 = HampelFilter(dx=3, t=0, fulloutput=True)(Y) # Y0 = median X = np.arange(len(YY)) plt.plot(X, Y, 'b.') # Original Data plt.plot(X, YY, 'r') # Hampel Filtered Data plt.plot(X, res['Y0'], 'b--') # Nominal Data plt.plot(X, res['LB'], 'r--') # Lower Bounds on Hampel Filter plt.plot(X, res['UB'], 'r--') # Upper Bounds on Hampel Filter i = res['outliers'] plt.plot(X[i], Y[i], 'ks') # Identified Outliers plt.show('hold') References ---------- Chapters 1.4.2, 3.2.2 and 4.3.4 in Mining Imperfect Data: Dealing with Contamination and Incomplete Records by Ronald K. Pearson. Acknowledgements I would like to thank Ronald K. Pearson for the introduction to moving window filters. Please visit his blog at: http://exploringdatablog.blogspot.com/2012/01/moving-window-filters-and -pracma.html """ def __init__(self, dx=None, t=3, adaptive=None, fulloutput=False): self.dx = dx self.t = t self.adaptive = adaptive self.fulloutput = fulloutput @staticmethod def _check(dx): _assert(np.isscalar(dx), 'DX must be a scalar.') _assert(0 < dx, 'DX must be larger than zero.') @staticmethod def localwindow(X, Y, DX, i): mask = (X[i] - DX <= X) & (X <= X[i] + DX) Y0 = np.median(Y[mask]) # Calculate Local Scale of Natural Variation S0 = 1.4826 * np.median(np.abs(Y[mask] - Y0)) return Y0, S0 @staticmethod def smgauss(X, V, DX): Xj = X Xk = np.atleast_2d(X).T Wjk = np.exp(-((Xj - Xk) / (2 * DX)) ** 2) G = np.dot(Wjk, V) / np.sum(Wjk, axis=0) return G def _adaptive(self, Y, X, dx): localwindow = self.localwindow Y0, S0, ADX = self._init(Y, dx) Y0Tmp = np.nan * np.zeros(Y.shape) S0Tmp = np.nan * np.zeros(Y.shape) DXTmp = np.arange(1, len(S0) + 1) * dx # Integer variation of Window Half Size # Calculate Initial Guess of Optimal Parameters Y0, S0, ADX for i in range(len(Y)): j = 0 S0Rel = np.inf while S0Rel > self.adaptive: Y0Tmp[j], S0Tmp[j] = localwindow(X, Y, DXTmp[j], i) if j > 0: S0Rel = np.abs((S0Tmp[j - 1] - S0Tmp[j]) / (S0Tmp[j - 1] + S0Tmp[j]) / 2) j += 1 Y0[i] = Y0Tmp[j - 2] S0[i] = S0Tmp[j - 2] ADX[i] = DXTmp[j - 2] / dx # Gaussian smoothing of relevant parameters DX = 2 * np.median(np.diff(X)) ADX = self.smgauss(X, ADX, DX) S0 = self.smgauss(X, S0, DX) Y0 = self.smgauss(X, Y0, DX) return Y0, S0, ADX def _init(self, Y, dx): S0 = np.nan * np.zeros(Y.shape) Y0 = np.nan * np.zeros(Y.shape) ADX = dx * np.ones(Y.shape) return Y0, S0, ADX def _fixed(self, Y, X, dx): localwindow = self.localwindow Y0, S0, ADX = self._init(Y, dx) for i in range(len(Y)): Y0[i], S0[i] = localwindow(X, Y, dx, i) return Y0, S0, ADX def _filter(self, Y, X, dx): if len(X) <= 1: Y0, S0, ADX = self._init(Y, dx) elif self.adaptive is None: Y0, S0, ADX = self._fixed(Y, X, dx) else: Y0, S0, ADX = self._adaptive(Y, X, dx) # 'adaptive' return Y0, S0, ADX def __call__(self, y, x=None): Y = np.atleast_1d(y).ravel() if x is None: x = range(len(Y)) X = np.atleast_1d(x).ravel() dx = 3 * np.median(np.diff(X)) if self.dx is None else self.dx self._check(dx) Y0, S0, ADX = self._filter(Y, X, dx) YY = Y.copy() T = self.t # Prepare Output self.UB = Y0 + T * S0 self.LB = Y0 - T * S0 outliers = np.abs(Y - Y0) > T * S0 # possible outliers np.putmask(YY, outliers, Y0) # YY[outliers] = Y0[outliers] self.outliers = outliers self.num_outliers = outliers.sum() self.ADX = ADX self.Y0 = Y0 if self.fulloutput: return YY, dict(outliers=outliers, Y0=Y0, LB=self.LB, UB=self.UB, ADX=ADX) return YY if __name__ == '__main__': from wafo.testing import test_docstrings test_docstrings(__file__)