diff --git a/pywafo/src/wafo/stats/__init__.py b/pywafo/src/wafo/stats/__init__.py
index f08000f..79d3265 100644
--- a/pywafo/src/wafo/stats/__init__.py
+++ b/pywafo/src/wafo/stats/__init__.py
@@ -1,13 +1,9 @@
-"""
-Statistics package in WAFO Toolbox.
-
-    Readme               - Readme file for module STATS in WAFO Toolbox
-
-"""
+"""
+Statistics package in WAFO Toolbox.
+
+    Readme               - Readme file for module STATS in WAFO Toolbox
+
+"""
 from scipy.stats import *
-from wafo.stats.core import *
-from wafo.stats.distributions import *
-#from wafo.spectrum.core import SpecData1D
-#import wafo.spectrum.models
-#import wafo.spectrum.dispersion_relation
-#from wafo.data_structures import SpecData1D
\ No newline at end of file
+from wafo.stats.core import *
+from wafo.stats.distributions import *
diff --git a/pywafo/src/wafo/stats/core.py b/pywafo/src/wafo/stats/core.py
index 5e3ad13..be470cc 100644
--- a/pywafo/src/wafo/stats/core.py
+++ b/pywafo/src/wafo/stats/core.py
@@ -1,10 +1,34 @@
 from __future__ import division
+import warnings
 from wafo.wafodata import WafoData
+from wafo.misc import findextrema
+from scipy import special
 import numpy as np
 from numpy import inf
-from numpy import atleast_1d
-from numpy import arange, floor
-__all__ = ['edf']
+from numpy import atleast_1d, nan, ndarray, sqrt, vstack, ones, where, zeros
+from numpy import arange, floor, linspace, asarray, reshape, repeat, product
+
+
+__all__ = ['edf', 'edfcnd']
+
+arr = asarray
+
+def valarray(shape, value=nan, typecode=None):
+    """Return an array of given shape, filled with value.
+    """
+    out = reshape(repeat([value], product(shape, axis=0), axis=0), shape)
+    if typecode is not None:
+        out = out.astype(typecode)
+    if not isinstance(out, ndarray):
+        out = arr(out)
+    return out
+
+def _invt(q, df):
+    return special.stdtrit(df, q)
+
+def _invchi2(q, df):
+    return special.chdtri(df, q)
+
 
 def edf(x, method=2):
     '''
@@ -25,7 +49,7 @@
     >>> x = np.linspace(0,6,200)
     >>> R = ws.rayleigh.rvs(scale=2,size=100)
     >>> F = ws.edf(R)
-    >>> F.plot()
+    >>> h = F.plot()
 
     See also edf, pdfplot, cumtrapz
     '''
@@ -64,10 +88,10 @@
     >>> import wafo.stats as ws
     >>> x = np.linspace(0,6,200)
     >>> R = ws.rayleigh.rvs(scale=2,size=100)
-    >>> Fc = ws.edfcd(R, 1)
-    >>> Fc.plot()
+    >>> Fc = ws.edfcnd(R, 1)
+    >>> hc = Fc.plot()
     >>> F = ws.edf(R)
-    >>> F.plot()
+    >>> h = F.plot()
 
     See also edf, pdfplot, cumtrapz
     '''
@@ -84,3 +108,586 @@
         F.labels.ylab = 'F(x| X>=%g)' % c
 
     return F
+
+
+def reslife(data, u=None, umin=None, umax=None, nu=None, nmin=3, alpha=0.05, plotflag=False):
+    '''
+    Return Mean Residual Life, i.e., mean excesses vs thresholds
+
+    Parameters
+    ----------
+    data : array_like
+        vector of data of length N.
+    u : array-like
+        threshold values (default linspace(umin, umax, nu))
+    umin, umax : real scalars
+        Minimum and maximum threshold, respectively (default min(data), max(data)).
+    nu : scalar integer
+        number of threshold values (default min(N-nmin,100))
+    nmin : scalar integer
+        Minimum number of extremes to include. (Default 3).
+    alpha : real scalar
+        Confidence coefficient (default 0.05)
+    plotflag : bool
+        If True, plot the result (default False).
+
+    Returns
+    -------
+    mrl : WafoData object
+        Mean residual life values, i.e., mean excesses over thresholds, u.
+
+    Notes
+    -----
+    RESLIFE estimates mean excesses over thresholds. The purpose of MRL is
+    to determine the threshold where the upper tail of the data can be
+    approximated with the generalized Pareto distribution (GPD). The GPD is
+    appropriate for the tail if the MRL is a linear function of the
+    threshold, u. Theoretically, in the GPD model
+
+        E(X-u0|X>u0) = s0/(1+k)
+        E(X-u |X>u)  = s/(1+k) = (s0 - k*u)/(1+k)   for u > u0
+
+    where k and s are the shape and scale parameter, respectively, and
+    s0 = scale parameter for threshold u0 < umin.
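+
+    For instance (editor's illustration of the formula above), with k = 0.1,
+    s0 = 2 and u0 = 0 the model gives E(X-u|X>u) = (2 - 0.1*u)/1.1, a straight
+    line in u; a roughly linear MRL plot above some threshold therefore
+    supports a GPD approximation of the tail beyond that threshold.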
+
+    Example
+    -------
+    >>> import wafo
+    >>> R = wafo.stats.genpareto.rvs(0.1,2,2,size=100)
+    >>> mrl = reslife(R,nu=20)
+    >>> h = mrl.plot()
+
+    See also
+    --------
+    genpareto
+    fitgenparrange, disprsnidx
+    '''
+    if u is None:
+        sd = np.sort(data)
+        n = len(data)
+
+        nmin = max(nmin, 0)
+        if 2 * nmin > n:
+            warnings.warn('nmin possibly too large!')
+
+        sdmax, sdmin = sd[-nmin], sd[0]
+        umax = sdmax if umax is None else min(umax, sdmax)
+        umin = sdmin if umin is None else max(umin, sdmin)
+
+        if nu is None:
+            nu = min(n - nmin, 100)
+
+        u = linspace(umin, umax, nu)
+
+    nu = len(u)
+
+    #mrl1 = valarray(nu)
+    #srl = valarray(nu)
+    #num = valarray(nu)
+
+    mean_and_std = lambda data1: (data1.mean(), data1.std(), data1.size)
+    dat = arr(data)
+    tmp = arr([mean_and_std(dat[dat > tresh] - tresh) for tresh in u.tolist()])
+
+    mrl, srl, num = tmp.T
+    p = 1 - alpha
+    alpha2 = alpha / 2
+
+    # Approximate P% confidence interval
+    #Za = -invnorm(alpha2)  # known mean
+    Za = -_invt(alpha2, num - 1)  # unknown mean
+    mrlu = mrl + Za * srl / sqrt(num)
+    mrll = mrl - Za * srl / sqrt(num)
+
+    #options.CI = [mrll, mrlu]
+    #options.numdata = num
+    titleTxt = 'Mean residual life with %d%s CI' % (100 * p, '%')
+    res = WafoData(mrl, u, xlab='Threshold', ylab='Mean Excess', title=titleTxt)
+    res.workspace = dict(numdata=num, umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
+    res.children = [WafoData(vstack([mrll, mrlu]).T, u, xlab='Threshold', title=titleTxt)]
+    res.children_args = [':r']
+    if plotflag:
+        res.plot()
+    return res
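+
+
+def _example_reslife_to_gpd_fit():
+    # Editor's sketch (illustrative, not part of the original WAFO API): once
+    # the MRL plot is roughly linear above some threshold u0, the exceedances
+    # can be fitted by a generalized Pareto distribution, mirroring the fit
+    # call used in distributions.main() below. The threshold u0 is hypothetical.
+    import wafo.stats as ws
+    R = ws.genpareto.rvs(0.1, 2, 2, size=200)
+    mrl = reslife(R, nu=20)
+    u0 = 2.5  # in practice: read off mrl.plot()
+    phat = ws.genpareto.fit(R[R > u0], f0=0.1, floc=u0)  # FitDistribution object
+    return mrl, phat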
+
+def dispersion_idx(data, t=None, u=None, umin=None, umax=None, nu=None, nmin=10, tb=1,
+                   alpha=0.05, plotflag=False):
+    '''Return Dispersion Index vs threshold
+
+    Parameters
+    ----------
+    data, t : array_like
+        data values and sampled times, respectively.
+    u : array-like
+        threshold values (default linspace(umin, umax, nu))
+    umin, umax : real scalars
+        Minimum and maximum threshold, respectively (default min(data), max(data)).
+    nu : scalar integer
+        number of threshold values (default min(N-nmin,100))
+    nmin : scalar integer
+        Minimum number of extremes to include. (Default 10).
+    tb : real scalar
+        Block period (same unit as the sampled times) (default 1)
+    alpha : real scalar
+        Confidence coefficient (default 0.05)
+    plotflag : bool
+        If True, plot the result (default False).
+
+    Returns
+    -------
+    DI : WafoData object
+        Dispersion index
+    b_u : real scalar
+        threshold where the number of exceedances in a fixed period (Tb) is
+        consistent with a Poisson process.
+    ok_u : array-like
+        all thresholds where the number of exceedances in a fixed period (Tb)
+        is consistent with a Poisson process.
+
+    Notes
+    -----
+    DISPRSNIDX estimates the Dispersion Index (DI) as a function of threshold.
+    DI measures the homogeneity of data and the purpose of DI is to determine
+    the threshold where the number of exceedances in a fixed period (Tb) is
+    consistent with a Poisson process. For a Poisson process the DI is one.
+    Thus the threshold should be so high that DI is not significantly
+    different from 1.
+
+    The Poisson hypothesis is not rejected if the estimated DI satisfies
+
+        chi2(alpha/2, M-1)/(M-1) < DI < chi2(1-alpha/2, M-1)/(M-1)
+
+    where M is the total number of fixed periods/blocks, generally
+    the total number of years in the sample.
+
+    Example
+    -------
+    >>> import wafo.data
+    >>> xn = wafo.data.sea()
+    >>> t, data = xn.T
+    >>> Ie = findpot(data,t,0,5)
+    >>> di, u, ok_u = dispersion_idx(data[Ie],t[Ie],tb=100)
+    >>> h = di.plot() # a threshold around 1 seems appropriate.
+
+    vline(u)
+
+    See also
+    --------
+    reslife,
+    fitgenparrange,
+    extremal_idx
+
+    References
+    ----------
+    Ribatet, M. A. (2006),
+    A User's Guide to the POT Package (Version 1.0), August 2006.
+    http://cran.r-project.org/
+
+    Cunnane, C. (1979) Note on the Poisson assumption in partial duration
+    series models. Water Resources Research, 15(2), 489-494.
+    '''
+
+# This program is free software; you can redistribute it and/or modify it under the terms of the GNU
+# General Public License as published by the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful, but without any warranty; without even
+# the implied warranty of merchantability or fitness for a particular purpose. See the GNU General Public
+# License for more details.
+# The GNU General Public License can be obtained from http://www.gnu.org/copyleft/gpl.html. You
+# can also obtain it by writing to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+    n = len(data)
+    if t is None:
+        ti = arange(n)
+    else:
+        ti = arr(t) - min(t)
+
+    t1 = np.empty(ti.shape, dtype=int)
+    t1[:] = np.floor(ti / tb)
+
+    if u is None:
+        sd = np.sort(data)
+
+        nmin = max(nmin, 0)
+        if 2 * nmin > n:
+            warnings.warn('nmin possibly too large!')
+
+        sdmax, sdmin = sd[-nmin], sd[0]
+        umax = sdmax if umax is None else min(umax, sdmax)
+        umin = sdmin if umin is None else max(umin, sdmin)
+
+        if nu is None:
+            nu = min(n - nmin, 100)
+
+        u = linspace(umin, umax, nu)
+
+    nu = len(u)
+
+    di = np.zeros(nu)
+
+    d = arr(data)
+
+    mint = int(min(t1))  # mint should be 0.
+    maxt = int(max(t1))
+    M = maxt - mint + 1
+    occ = np.zeros(M)
+
+    for ix, tresh in enumerate(u.tolist()):
+        excess = (d > tresh)
+        lambda_ = excess.sum() / M
+        for block in range(M):
+            occ[block] = sum(excess[t1 == block])
+
+        di[ix] = occ.var() / lambda_
+
+    p = 1 - alpha
+
+    diUp = _invchi2(1 - alpha / 2, M - 1) / (M - 1)
+    diLo = _invchi2(alpha / 2, M - 1) / (M - 1)
+
+    # Find appropriate threshold
+    k1, = np.where((diLo < di) & (di < diUp))
+    if len(k1) > 0:
+        ok_u = u[k1]
+        b_di = (di[k1].mean() < di[k1])
+        k = b_di.argmax()
+        b_u = ok_u[k]
+    else:
+        b_u = ok_u = None
+
+    CItxt = '%d%s CI' % (100 * p, '%')
+    titleTxt = 'Dispersion Index plot'
+
+    res = WafoData(di, u, title=titleTxt, xlab='Threshold', ylab='Dispersion Index')
+    #'caption', CItxt
+    res.workspace = dict(umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
+    res.children = [WafoData(vstack([diLo * ones(nu), diUp * ones(nu)]).T, u, xlab='Threshold', title=CItxt)]
+    res.children_args = ['--r']
+    if plotflag:
+        res.plot()
+    return res, b_u, ok_u
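+
+
+def _example_poisson_dispersion():
+    # Editor's sketch (illustrative, not part of the original WAFO API): for
+    # Poisson distributed block counts the dispersion index var/mean is close
+    # to one, which is exactly what dispersion_idx checks at every threshold.
+    counts = np.random.poisson(lam=5.0, size=1000)
+    return counts.var() / counts.mean()  # approximately 1 for a Poisson process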
+
+def decluster(data, t=None, thresh=None, tmin=1):
+    '''
+    Return declustered peaks over threshold values
+
+    Parameters
+    ----------
+    data, t : array-like
+        data-values and sampling-times, respectively.
+    thresh : real scalar
+        minimum threshold for levels in data.
+    tmin : real scalar
+        minimum distance to another peak [same unit as t] (default 1)
+
+    Returns
+    -------
+    ev, te : ndarray
+        extreme values and their corresponding sampling times, respectively,
+        i.e., all data > thresh which are at least tmin distance apart.
+
+    Example
+    -------
+    >>> import pylab
+    >>> import wafo.data
+    >>> from wafo.misc import findtc
+    >>> x = wafo.data.sea()
+    >>> t, data = x[:400,:].T
+    >>> itc, iv = findtc(data,0,'dw')
+    >>> ytc, ttc = data[itc], t[itc]
+    >>> ymin = 2*data.std()
+    >>> tmin = 10 # sec
+    >>> ye, te = decluster(ytc, ttc, ymin, tmin)
+    >>> h = pylab.plot(t,data,ttc,ytc,'ro',t,zeros(len(t)),':',te,ye,'k.')
+
+    See also
+    --------
+    fitgenpar, findpot, extremalidx
+    '''
+    if t is None:
+        t = np.arange(len(data))
+    i = findpot(data, t, thresh, tmin)
+    return data[i], t[i]
+
+def findpot(data, t=None, thresh=None, tmin=1):
+    '''
+    Return indices to Peaks over threshold values
+
+    Parameters
+    ----------
+    data, t : array-like
+        data-values and sampling-times, respectively.
+    thresh : real scalar
+        minimum threshold for levels in data.
+    tmin : real scalar
+        minimum distance to another peak [same unit as t] (default 1)
+
+    Returns
+    -------
+    Ie : ndarray
+        indices to extreme values, i.e., all data > thresh which are at least
+        tmin distance apart.
+
+    Example
+    -------
+    >>> import pylab
+    >>> import wafo.data
+    >>> from wafo.misc import findtc
+    >>> x = wafo.data.sea()
+    >>> t, data = x.T
+    >>> itc, iv = findtc(data,0,'dw')
+    >>> ytc, ttc = data[itc], t[itc]
+    >>> ymin = 2*data.std()
+    >>> tmin = 10 # sec
+    >>> I = findpot(data, t, ymin, tmin)
+    >>> yp, tp = data[I], t[I]
+    >>> Ie = findpot(yp, tp, ymin, tmin)
+    >>> ye, te = yp[Ie], tp[Ie]
+    >>> h = pylab.plot(t,data,ttc,ytc,'ro',t,zeros(len(t)),':',te, ye,'k.',tp,yp,'+')
+
+    See also
+    --------
+    fitgenpar, decluster, extremalidx
+    '''
+    Data = arr(data)
+    if t is None:
+        ti = np.arange(len(Data))
+    else:
+        ti = arr(t)
+
+    Ie, = where(Data > thresh)
+    Ye = Data[Ie]
+    Te = ti[Ie]
+    if len(Ye) <= 1:
+        return Ie
+
+    dT = np.diff(Te)
+    notSorted = np.any(dT < 0)
+    if notSorted:
+        I = np.argsort(Te)
+        Te = Te[I]
+        Ie = Ie[I]
+        Ye = Ye[I]
+        dT = np.diff(Te)
+
+    isTooSmall = (dT <= tmin)
+
+    if np.any(isTooSmall):
+        isTooClose = np.hstack((isTooSmall[0], isTooSmall[:-1] | isTooSmall[1:], isTooSmall[-1]))
+
+        # Find opening (NO) and closing (NC) indices for data being too close:
+        iy = findextrema(np.hstack([0, 0, isTooSmall, 0]))
+
+        NO = iy[::2] - 1
+        NC = iy[1::2]
+
+        for no, nc in zip(NO, NC):
+            iz = slice(no, nc)
+            iOK = _find_ok_peaks(Ye[iz], Te[iz], tmin)
+            if len(iOK):
+                isTooClose[no + iOK] = 0
+
+        # Remove data which is too close to other data.
+        if isTooClose.any():
+            iOK, = where(1 - isTooClose)
+            Ie = Ie[iOK]
+
+    return Ie
+
+
+def _find_ok_peaks(Ye, Te, Tmin):
+    '''
+    Return indices to the largest maxima that are at least Tmin
+    distance apart.
+    '''
+    Ny = len(Ye)
+
+    I = np.argsort(-Ye)  # sort in descending order
+
+    Te1 = Te[I]
+    oOrder = zeros(Ny, dtype=int)
+    oOrder[I] = range(Ny)  # indices to the variables' original locations
+
+    isTooClose = zeros(Ny, dtype=bool)
+
+    pool = zeros((Ny, 2))
+    T_range = np.hstack([-Tmin, Tmin])
+    K = 0
+    for i, ti in enumerate(Te1):
+        isTooClose[i] = np.any((pool[:K, 0] <= ti) & (ti <= pool[:K, 1]))
+        if not isTooClose[i]:
+            pool[K] = ti + T_range
+            K += 1
+
+    iOK, = where(1 - isTooClose[oOrder])
+    return iOK
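+
+
+def _example_find_ok_peaks():
+    # Editor's worked example (illustrative, not part of the original WAFO
+    # API): the peaks at t = 0 and t = 0.5 are closer than Tmin = 1, so only
+    # the larger of that pair survives, together with the isolated peak at t = 2.
+    Ye = np.array([1.0, 3.0, 2.0])
+    Te = np.array([0.0, 0.5, 2.0])
+    return _find_ok_peaks(Ye, Te, 1.0)  # -> array([1, 2])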
+
+
+def declustering_time(t):
+    '''
+    Return minimum distance between clusters.
+
+    Parameters
+    ----------
+    t : array-like
+        sampling times for data.
+
+    Returns
+    -------
+    tc : real scalar
+        minimum distance between clusters.
+
+    Example
+    -------
+    >>> import wafo.data
+    >>> x = wafo.data.sea()
+    >>> t, data = x[:400,:].T
+    >>> Ie = findpot(data,t,0,5)
+    >>> tc = declustering_time(Ie)
+    >>> tc
+    21
+
+    '''
+    t0 = arr(t)
+    nt = len(t0)
+    if nt < 2:
+        return arr([])
+    ti = interexceedance_times(t0)
+    ei = extremal_idx(ti)
+    if ei == 1:
+        tc = ti.min()
+    else:
+        i = int(np.floor(nt * ei))
+        sti = -np.sort(-ti)
+        tc = sti[min(i, nt - 2)]  # declustering time
+    return tc
+
+
+def interexceedance_times(t):
+    '''
+    Return interexceedance times of data
+
+    Parameters
+    ----------
+    t : array-like
+        sampling times for data.
+
+    Returns
+    -------
+    ti : ndarray
+        interexceedance times
+
+    Example
+    -------
+    >>> t = [1,2,5,10]
+    >>> interexceedance_times(t)
+    array([1, 3, 5])
+
+    '''
+    return np.diff(np.sort(t))
+
+def extremal_idx(ti):
+    '''
+    Return Extremal Index measuring the dependence of data
+
+    Parameters
+    ----------
+    ti : array-like
+        interexceedance times for data.
+
+    Returns
+    -------
+    ei : real scalar
+        Extremal index.
+
+    Notes
+    -----
+    The Extremal Index (EI) is one if the data are independent and less than
+    one if there is some dependence. The extremal index can also be
+    interpreted as the reciprocal of the mean cluster size.
+
+    Example
+    -------
+    >>> import wafo.data
+    >>> x = wafo.data.sea()
+    >>> t, data = x[:400,:].T
+    >>> Ie = findpot(data,t,0,5)
+    >>> ti = interexceedance_times(Ie)
+    >>> ei = extremal_idx(ti)
+    >>> ei
+    1
+
+    See also
+    --------
+    reslife, fitgenparrange, disprsnidx, findpot, decluster
+
+    Reference
+    ---------
+    Christopher A. T. Ferro, Johan Segers (2003)
+    Inference for clusters of extreme values
+    Journal of the Royal Statistical Society: Series B (Statistical Methodology) 65(2), 545-556
+    doi:10.1111/1467-9868.00401
+    '''
+    t = arr(ti)
+    tmax = t.max()
+    if tmax <= 1:
+        ei = 0
+    elif tmax <= 2:
+        ei = min(1, 2 * t.mean() ** 2 / ((t ** 2).mean()))
+    else:
+        ei = min(1, 2 * np.mean(t - 1) ** 2 / np.mean((t - 1) * (t - 2)))
+    return ei
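+
+
+def _example_extremal_idx_iid():
+    # Editor's sketch (illustrative, not part of the original WAFO API): for
+    # independent exceedance times the estimated extremal index should come
+    # out close to one, i.e., a mean cluster size of about 1/ei = 1.
+    t = np.sort(np.random.uniform(0.0, 10000.0, size=200))
+    return extremal_idx(interexceedance_times(t))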
+
+
+def _test_dispersion_idx():
+    import wafo.data
+    xn = wafo.data.sea()
+    t, data = xn.T
+    Ie = findpot(data, t, 0, 5)
+    di, u, ok_u = dispersion_idx(data[Ie], t[Ie], tb=100)
+    di.plot()  # a threshold around 1 seems appropriate.
+    di.show()
+
+def _test_findpot():
+    import pylab
+    import wafo.data
+    from wafo.misc import findtc
+    x = wafo.data.sea()
+    t, data = x[:, :].T
+    itc, iv = findtc(data, 0, 'dw')
+    ytc, ttc = data[itc], t[itc]
+    ymin = 2 * data.std()
+    tmin = 10  # sec
+    I = findpot(data, t, ymin, tmin)
+    yp, tp = data[I], t[I]
+    Ie = findpot(yp, tp, ymin, tmin)
+    ye, te = yp[Ie], tp[Ie]
+    h = pylab.plot(t, data, ttc, ytc, 'ro', t, zeros(len(t)), ':', te, ye, 'kx', tp, yp, '+')
+    pylab.show()
+
+def _test_reslife():
+    import wafo
+    R = wafo.stats.genpareto.rvs(0.1, 2, 2, size=100)
+    mrl = reslife(R, nu=20)
+    mrl.plot()
+
+def main():
+    #_test_dispersion_idx()
+    import doctest
+    doctest.testmod()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/pywafo/src/wafo/stats/distributions.py b/pywafo/src/wafo/stats/distributions.py
index 358a897..d9d5c36 100644
--- a/pywafo/src/wafo/stats/distributions.py
+++ b/pywafo/src/wafo/stats/distributions.py
@@ -400,15 +400,15 @@ class rv_frozen(object):
     def stats(self, moments='mv'):
         ''' Some statistics of the given RV'''
         kwds = dict(moments=moments)
         return self.dist.stats(*self.par, **kwds)
     def median(self):
-        return self.dist.median(*self.par, **self.kwds)
+        return self.dist.median(*self.par)
     def mean(self):
-        return self.dist.mean(*self.par,**self.kwds)
+        return self.dist.mean(*self.par)
     def var(self):
-        return self.dist.var(*self.par, **self.kwds)
+        return self.dist.var(*self.par)
     def std(self):
-        return self.dist.std(*self.par, **self.kwds)
+        return self.dist.std(*self.par)
     def moment(self, n):
         par1 = self.par[:self.dist.numargs]
         return self.dist.moment(n, *par1)
@@ -418,7 +418,7 @@
         '''Probability mass function at k of the given RV'''
         return self.dist.pmf(k, *self.par)
     def interval(self,alpha):
-        return self.dist.interval(alpha, *self.par, **self.kwds)
+        return self.dist.interval(alpha, *self.par)
 
 
 # Frozen RV class
@@ -1982,10 +1982,8 @@ class rv_continuous(rv_generic):
     def link(self, x, logSF, theta, i):
         ''' Return dist. par. no. i as function of quantile (x) and log survival probability (sf)
-
-        Assumptions:
-        ------------
-        theta is list containing all parameters including location and scale.
+        where
+        theta is the list containing all parameters including location and scale.
         '''
         raise ValueError('Link function not implemented for the %s distribution' % self.name)
         return None
@@ -2077,7 +2075,7 @@
         # invert the Hessian matrix (i.e. invert the observed information number)
         #pcov = -pinv(H);
-        return - H
+        return -H
 
     # return starting point for fit (shape arguments + loc + scale)
     def _fitstart(self, data, args=None):
@@ -2231,25 +2229,6 @@
         in your namespace will be sorted as well.
         '''
         return FitDistribution(self, data, *args, **kwds)
-
-#        loc0, scale0, method = map(kwds.get, ['loc', 'scale','method'],[none, none,'ml'])
-#        args, loc0, scale0 = self.fix_loc_scale(args, loc0, scale0)
-#        Narg = len(args)
-#        if Narg != self.numargs:
-#            if Narg > self.numargs:
-#                raise ValueError, "Too many input arguments."
-#            else:
-#                args += (1.0,)*(self.numargs-Narg)
-#        # location and scale are at the end
-#        x0 = args + (loc0, scale0)
-#        if method.lower()[:].startswith('mps'):
-#            data.sort()
-#            fitfun = self.nlogps
-#        else:
-#            fitfun = self.nnlf
-#
-#        return optimize.fmin(fitfun,x0,args=(ravel(data),),disp=0)
-
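+    # Editor's note (illustrative, not part of the original patch): with the
+    # new fit() the estimates live on the returned FitDistribution object
+    # rather than in a plain tuple, e.g.
+    #
+    #     phat = genpareto.fit(R, f0=0.1, floc=0.7)
+    #     phat.par, phat.par_lower, phat.par_upper  # estimates and 95% limits
+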
     def fit_loc_scale(self, data, *args):
         """
@@ -3387,6 +3366,20 @@ class genpareto_gen(rv_continuous):
         #vals = 1.0/c * (pow(1-q, -c)-1)
         #return vals
 
+
+    def _fitstart(self, data):
+        d = arr(data)
+        loc = d.min() - 0.01 * d.std()
+        # moments estimator
+        d1 = d - loc
+        m = d1.mean()
+        s = d1.std()
+
+        shape = ((m / s) ** 2 - 1) / 2
+        scale = m * ((m / s) ** 2 + 1) / 2
+        return shape, loc, scale
+
+
     def hessian_nnlf(self, theta, x, eps=None):
         try:
             loc = theta[-2]
@@ -6876,6 +6869,10 @@ Skellam distribution
 
 def main():
     import matplotlib
     matplotlib.interactive(True)
+    R = norm.rvs(size=100)
+    phat = norm.fit(R)
+    phat = genpareto.fit(R[R > 0.7], f0=0.1, floc=0.7)
+
     #nbinom(10, 0.75).rvs(3)
     t = bernoulli(0.75).rvs(3)
     x = np.r_[5, 10]
diff --git a/pywafo/src/wafo/stats/estimation.py b/pywafo/src/wafo/stats/estimation.py
index 1b450b8..de0553e 100644
--- a/pywafo/src/wafo/stats/estimation.py
+++ b/pywafo/src/wafo/stats/estimation.py
@@ -10,21 +10,18 @@ from __future__ import division
 from wafo.plotbackend import plotbackend
 from wafo.misc import ecross, findcross
-from scipy.misc.ppimport import ppimport
+#from scipy.misc.ppimport import ppimport
 import numdifftools
 from scipy import special
 from scipy.linalg import pinv2
-
-
 from scipy import optimize
-from numpy import alltrue, arange, \
-    ravel, ones, sum, \
-    zeros, log, sqrt, exp
-from numpy import atleast_1d, any, asarray, nan, inf, pi, reshape, repeat, product, ndarray
+
 import numpy
 import numpy as np
-
+from numpy import alltrue, arange, ravel, ones, sum, zeros, log, sqrt, exp
+from numpy import (atleast_1d, any, asarray, nan, pi, reshape, repeat,
+                   product, ndarray, isfinite)
 from numpy import flatnonzero as nonzero
@@ -33,11 +30,8 @@ __all__ = [
            ]
 
 floatinfo = np.finfo(float)
-
-
 #arr = atleast_1d
 arr = asarray
-
 all = alltrue
 
 def chi2isf(p, df):
@@ -123,13 +117,13 @@ class rv_frozen(object):
         kwds = dict(moments=moments)
         return self.dist.stats(*self.par, **kwds)
     def median(self):
-        return self.dist.median(*self.par, **self.kwds)
+        return self.dist.median(*self.par)
     def mean(self):
-        return self.dist.mean(*self.par,**self.kwds)
+        return self.dist.mean(*self.par)
     def var(self):
-        return self.dist.var(*self.par, **self.kwds)
+        return self.dist.var(*self.par)
     def std(self):
-        return self.dist.std(*self.par, **self.kwds)
+        return self.dist.std(*self.par)
     def moment(self, n):
         par1 = self.par[:self.dist.numargs]
         return self.dist.moment(n, *par1)
@@ -139,9 +133,7 @@ class rv_frozen(object):
         '''Probability mass function at k of the given RV'''
         return self.dist.pmf(k, *self.par)
     def interval(self,alpha):
-        return self.dist.interval(alpha, *self.par, **self.kwds)
-
-
+        return self.dist.interval(alpha, *self.par)
 
 
 # internal class to profile parameters of a given distribution
@@ -261,9 +253,9 @@ class Profile(object):
         self.i_free = nonzero(isfree)
 
         self.Lmax = Lmax
-        self.alpha_Lrange = 0.5 * chi2isf(self.alpha, 1) #_WAFODIST.chi2.isf(self.alpha, 1)
+        self.alpha_Lrange = 0.5 * chi2isf(self.alpha, 1)
         self.alpha_cross_level = Lmax - self.alpha_Lrange
-        lowLevel = self.alpha_cross_level - self.alpha_Lrange / 7.0
+        #lowLevel = self.alpha_cross_level - self.alpha_Lrange / 7.0
 
         ## Check that par are actually at the optimum
         phatv = fit_dist.par.copy()
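+        # Editor's note (illustrative, not part of the original patch):
+        # alpha_cross_level = Lmax - 0.5*chi2isf(alpha, 1) is the level at
+        # which the profile log-likelihood is cut to give a 100*(1-alpha)%
+        # confidence interval for the profiled parameter (Wilks' theorem,
+        # one degree of freedom).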
@@ -327,7 +319,7 @@ class Profile(object):
         cond = self.data == -numpy.inf
         if any(cond):
             ind, = cond.nonzero()
-            self.data.put(ind, numpy.finfo(float).min / 2.0)
+            self.data.put(ind, floatinfo.min / 2.0)
             ind1 = numpy.where(ind == 0, ind, ind - 1)
             cl = self.alpha_cross_level - self.alpha_Lrange / 2.0
             t0 = ecross(self.args, self.data, ind1, cl)
@@ -446,16 +438,17 @@ class FitDistribution(rv_frozen):
     def __init__(self, dist, data, *args, **kwds):
         extradoc = '''
-        RV.plotfitsumry() - Plot various diagnostic plots to asses quality of fit.
-        RV.plotecdf() - Plot Empirical and fitted Cumulative Distribution Function
-        RV.plotesf() - Plot Empirical and fitted Survival Function
-        RV.plotepdf() - Plot Empirical and fitted Probability Distribution Function
-        RV.plotresq() - Displays a residual quantile plot.
-        RV.plotresprb() - Displays a residual probability plot.
-
-        RV.profile() - Return Profile Log- likelihood or Product Spacing-function.
-
-        Member variables:
+        RV.plotfitsumry() - Plot various diagnostic plots to assess quality of fit.
+        RV.plotecdf() - Plot Empirical and fitted Cumulative Distribution Function
+        RV.plotesf() - Plot Empirical and fitted Survival Function
+        RV.plotepdf() - Plot Empirical and fitted Probability Distribution Function
+        RV.plotresq() - Displays a residual quantile plot.
+        RV.plotresprb() - Displays a residual probability plot.
+
+        RV.profile() - Return Profile Log-likelihood or Product Spacing function.
+
+        Member variables
+        ----------------
         data - data used in fitting
         alpha - confidence coefficient
         method - method used
@@ -475,86 +468,144 @@ class FitDistribution(rv_frozen):
         self.__doc__ = rv_frozen.__doc__ + extradoc
         self.dist = dist
         numargs = dist.numargs
-
-        self.method, self.alpha, self.par_fix, self.search, self.copydata = map(kwds.get, ['method', 'alpha', 'par_fix', 'search', 'copydata'], ['ml', 0.05, None, True, True])
-        self.data = ravel(data)
-        if self.copydata:
-            self.data = self.data.copy()
-        self.data.sort()
+
+        self.method = self.alpha = self.par_fix = self.search = self.copydata = None
+        m_variables = ['method', 'alpha', 'par_fix', 'search', 'copydata']
+        m_defaults = ['ml', 0.05, None, True, True]
+        for (name, val) in zip(m_variables, m_defaults):
+            setattr(self, name, kwds.get(name, val))
+
+        #self.method, self.alpha, ... = map(kwds.get, m_variables, m_defaults)
         if self.method.lower()[:].startswith('mps'):
             self._fitfun = dist.nlogps
         else:
             self._fitfun = dist.nnlf
-
-        allfixed = False
-        isfinite = numpy.isfinite
-        somefixed = (self.par_fix != None) and any(isfinite(self.par_fix))
-
+
+        self.data = ravel(data)
+        if self.copydata:
+            self.data = self.data.copy()
+        self.data.sort()
+
+        par, fixedn = self._fit(*args, **kwds)
+        self.par = arr(par)
+        somefixed = len(fixedn) > 0
         if somefixed:
-            fitfun = self._fxfitfun
-            self.par_fix = tuple(self.par_fix)
-            allfixed = all(isfinite(self.par_fix))
-            self.par = atleast_1d(self.par_fix)
-            self.i_notfixed = nonzero(1 - isfinite(self.par))
-            self.i_fixed = nonzero(isfinite(self.par))
-            if len(self.par) != numargs + 2:
-                raise ValueError, "Wrong number of input arguments."
-            if len(args) != len(self.i_notfixed):
-                raise ValueError("Length of args must equal number of non-fixed parameters given in par_fix! (%d) " % len(self.i_notfixed))
-            x0 = atleast_1d(args)
-        else:
-            fitfun = self.fitfun
-            loc0, scale0 = map(kwds.get, ['loc', 'scale'])
-            args, loc0, scale0 = dist.fix_loc_scale(args, loc0, scale0)
-            Narg = len(args)
-            if Narg != numargs:
-                if Narg > numargs:
-                    raise ValueError, "Too many input arguments."
-                else:
-                    args += (1.0,)*(numargs - Narg)
-                # location and scale are at the end
-                x0 = args + (loc0, scale0)
-            x0 = atleast_1d(x0)
-
-        numpar = len(x0)
-        if self.search and not allfixed:
-            #args=(self.data,),
-            par = optimize.fmin(fitfun, x0, disp=0)
-            if not somefixed:
-                self.par = par
-        elif (not allfixed) and somefixed:
-            self.par[self.i_notfixed] = x0
-        else:
-            self.par = x0
-
-        np = numargs + 2
-
-        self.par_upper = None
-        self.par_lower = None
-        self.par_cov = zeros((np, np))
+            self.par_fix = [nan, ] * len(self.par)
+            for i in fixedn:
+                self.par_fix[i] = self.par[i]
+
+            self.i_notfixed = nonzero(1 - isfinite(self.par_fix))
+            self.i_fixed = nonzero(isfinite(self.par_fix))
+
+        numpar = numargs + 2
+        self.par_cov = zeros((numpar, numpar))
+        self._compute_cov()
+
+        # Set confidence interval for parameters
+        pvar = numpy.diag(self.par_cov)
+        zcrit = -norm_ppf(self.alpha / 2.0)
+        self.par_lower = self.par - zcrit * sqrt(pvar)
+        self.par_upper = self.par + zcrit * sqrt(pvar)
+
         self.LLmax = -dist.nnlf(self.par, self.data)
         self.LPSmax = -dist.nlogps(self.par, self.data)
         self.pvalue = self._pvalue(self.par, self.data, unknown_numpar=numpar)
-        H = numpy.asmatrix(self._hessian_nnlf(self.par, self.data))
+
+    def _reduce_func(self, args, kwds):
+        args = list(args)
+        Nargs = len(args) - 2
+        fixedn = []
+        index = range(Nargs) + [-2, -1]
+        names = ['f%d' % n for n in range(Nargs)] + ['floc', 'fscale']
+        x0 = args[:]
+        for n, key in zip(index, names):
+            if kwds.has_key(key):
+                fixedn.append(n)
+                args[n] = kwds[key]
+                del x0[n]
+
+        fitfun = self._fitfun
+
+        if len(fixedn) == 0:
+            func = fitfun
+            restore = None
+        else:
+            if len(fixedn) == len(index):
+                raise ValueError, "All parameters fixed. There is nothing to optimize."
+            def restore(args, theta):
+                # Replace with theta for all numbers not in fixedn.
+                # This allows the non-fixed values to vary, but
+                # we still call self.nnlf with all parameters.
+                i = 0
+                for n in range(Nargs):
+                    if n not in fixedn:
+                        args[n] = theta[i]
+                        i += 1
+                return args
+
+            def func(theta, x):
+                newtheta = restore(args[:], theta)
+                return fitfun(newtheta, x)
+
+        return x0, func, restore, args, fixedn
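+
+    # Editor's worked trace (illustrative, not part of the original patch):
+    # for a distribution with one shape parameter, names is
+    # ['f0', 'floc', 'fscale'] and index is [0, -2, -1]; a call such as
+    # fit(data, floc=0) therefore appends -2 to fixedn, pins args[-2] at 0
+    # and drops it from x0, so the optimizer searches only over shape and scale.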
+
+    def _fit(self, *args, **kwds):
+
+        dist = self.dist
+        data = self.data
+
+        Narg = len(args)
+        if Narg > dist.numargs:
+            raise ValueError, "Too many input arguments."
+        start = [None] * 2
+        if (Narg < dist.numargs) or not (kwds.has_key('loc') and
+                                         kwds.has_key('scale')):
+            start = dist._fitstart(data)  # get distribution specific starting locations
+            args += start[Narg:-2]
+        loc = kwds.get('loc', start[-2])
+        scale = kwds.get('scale', start[-1])
+        args += (loc, scale)
+        x0, func, restore, args, fixedn = self._reduce_func(args, kwds)
+        if self.search:
+            optimizer = kwds.get('optimizer', optimize.fmin)
+            # convert string to function in scipy.optimize
+            if not callable(optimizer) and isinstance(optimizer, (str, unicode)):
+                if not optimizer.startswith('fmin_'):
+                    optimizer = "fmin_" + optimizer
+                if optimizer == 'fmin_':
+                    optimizer = 'fmin'
+                try:
+                    optimizer = getattr(optimize, optimizer)
+                except AttributeError:
+                    raise ValueError, "%s is not a valid optimizer" % optimizer
+            vals = optimizer(func, x0, args=(ravel(data),), disp=0)
+            vals = tuple(vals)
+        else:
+            vals = tuple(x0)
+        if restore is not None:
+            vals = restore(args, vals)
+        return vals, fixedn
+
+    def _compute_cov(self):
+        '''Compute covariance
+        '''
+        somefixed = (self.par_fix != None) and any(isfinite(self.par_fix))
+        H = numpy.asmatrix(self.dist.hessian_nnlf(self.par, self.data))
         self.H = H
         try:
-            if allfixed:
-                pass
-            elif somefixed:
-                pcov = -pinv2(H[self.i_notfixed, :][..., self.i_notfixed])
-                for row, ix in enumerate(list(self.i_notfixed)):
-                    self.par_cov[ix, self.i_notfixed] = pcov[row, :]
-
+            if somefixed:
+                allfixed = all(isfinite(self.par_fix))
+                if allfixed:
+                    self.par_cov[:, :] = 0
+                else:
+                    pcov = -pinv2(H[self.i_notfixed, :][..., self.i_notfixed])
+                    for row, ix in enumerate(list(self.i_notfixed)):
+                        self.par_cov[ix, self.i_notfixed] = pcov[row, :]
             else:
                 self.par_cov = -pinv2(H)
         except:
            self.par_cov[:, :] = nan
-
-        pvar = numpy.diag(self.par_cov)
-        zcrit = -norm_ppf(self.alpha / 2.0)#_WAFODIST.norm.ppf(self.alpha / 2.0)
-        self.par_lower = self.par - zcrit * sqrt(pvar)
-        self.par_upper = self.par + zcrit * sqrt(pvar)
-
     def fitfun(self, phat):
         return self._fitfun(phat, self.data)
 
@@ -562,7 +613,6 @@ class FitDistribution(rv_frozen):
         self.par[self.i_notfixed] = phat10
         return self._fitfun(self.par, self.data)
 
-
     def profile(self, **kwds):
         ''' Profile Log- likelihood or Log Product Spacing- function, which can be used for constructing confidence interval for
@@ -812,65 +862,7 @@ class FitDistribution(rv_frozen):
 
         return pvalue
 
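+    # Editor's note (illustrative, not part of the original patch): the
+    # parameter covariance computed in _compute_cov above is the inverse
+    # observed information, par_cov = -pinv2(H), where H is the central
+    # difference Hessian of the negative log-likelihood at the estimate; that
+    # computation now lives in dist.hessian_nnlf rather than in the private
+    # method removed below.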
-    def _hessian_nnlf(self, theta, data, eps=None):
-        ''' approximate hessian of nnlf where theta are the parameters (including loc and scale)
-        '''
-        #Nd = len(x)
-        np = len(theta)
-        # pab 07.01.2001: Always choose the stepsize h so that
-        # it is an exactly representable number.
-        # This is important when calculating numerical derivatives and is
-        # accomplished by the following.
-
-        if eps == None:
-            eps = (floatinfo.machar.eps) ** 0.4
-        #xmin = floatinfo.machar.xmin
-        #myfun = lambda y: max(y,100.0*log(xmin)) #% trick to avoid log of zero
-        delta = (eps + 2.0) - 2.0
-        delta2 = delta ** 2.0
-        # % Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with
-        # %  1/(d^2 L(theta|x)/dtheta^2)
-        # %  using central differences
-
-        dist = self.dist
-        LL = dist.nnlf(theta, data)
-        H = zeros((np, np))   #%% Hessian matrix
-        theta = tuple(theta)
-        for ix in xrange(np):
-            sparam = list(theta)
-            sparam[ix] = theta[ix] + delta
-            fp = dist.nnlf(sparam, data)
-            #fp = sum(myfun(x))
-
-            sparam[ix] = theta[ix] - delta
-            fm = dist.nnlf(sparam, data)
-            #fm = sum(myfun(x))
-
-            H[ix, ix] = (fp - 2 * LL + fm) / delta2
-            for iy in range(ix + 1, np):
-                sparam[ix] = theta[ix] + delta
-                sparam[iy] = theta[iy] + delta
-                fpp = dist.nnlf(sparam, data)
-                #fpp = sum(myfun(x))
-
-                sparam[iy] = theta[iy] - delta
-                fpm = dist.nnlf(sparam, data)
-                #fpm = sum(myfun(x))
-
-                sparam[ix] = theta[ix] - delta
-                fmm = dist.nnlf(sparam, data)
-                #fmm = sum(myfun(x))
-
-                sparam[iy] = theta[iy] + delta
-                fmp = dist.nnlf(sparam, data)
-                #fmp = sum(myfun(x))
-                H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2)
-                H[iy, ix] = H[ix, iy]
-            sparam[iy] = theta[iy]
-
-        # invert the Hessian matrix (i.e. invert the observed information number)
-        #pcov = -pinv(H);
-        return - H
+
 
 def main():
     _WAFODIST = ppimport('wafo.stats.distributions')
diff --git a/pywafo/src/wafo/wafodata.py b/pywafo/src/wafo/wafodata.py
index f8766fa..d473240 100644
--- a/pywafo/src/wafo/wafodata.py
+++ b/pywafo/src/wafo/wafodata.py
@@ -52,39 +52,41 @@ class WafoData(object):
     >>> h = d2.plot()
 
     Plot with confidence interval
-    d3 = wdata(sin(x),x)
-    d3 = set(d3,'dataCI',[sin(x(:))*0.9 sin(x(:))*1.2])
-    plot(d3)
+    >>> d3 = WafoData(np.sin(x), x)
+    >>> d3.children = [WafoData(np.vstack([np.sin(x)*0.9, np.sin(x)*1.2]).T, x)]
+    >>> d3.children_args = [':r']
+    >>> h = d3.plot()
 
     See also
    --------
-    wdata/plot, specdata, covdata
     '''
 
-    def __init__(self, data=None, args=None,*args2,**kwds):
+    def __init__(self, data=None, args=None, *args2, **kwds):
         self.data = data
         self.args = args
         self.date = now()
         self.plotter = None
         self.children = None
+        self.children_args = []
         self.labels = AxisLabels(**kwds)
         self.setplotter()
 
-    def plot(self,*args,**kwds):
+    def plot(self, *args, **kwds):
         tmp = None
-        if self.children!=None:
+        if self.children is not None:
             plotbackend.hold('on')
             tmp = []
+            child_args = args + tuple(self.children_args)
             for child in self.children:
-                tmp1 = child.plot(*args, **kwds)
-                if tmp1 !=None:
+                tmp1 = child.plot(*child_args, **kwds)
+                if tmp1 is not None:
                     tmp.append(tmp1)
-            if len(tmp)==0:
+            if len(tmp) == 0:
                 tmp = None
-        tmp2 = self.plotter.plot(self,*args,**kwds)
-        return tmp2,tmp
+        tmp2 = self.plotter.plot(self, *args, **kwds)
+        return tmp2, tmp
 
     def show(self):
         self.plotter.show()
@@ -95,20 +97,20 @@ class WafoData(object):
         return newcopy
 
-    def setplotter(self,plotmethod=None):
+    def setplotter(self, plotmethod=None):
        '''
            Set plotter based on the data type
                data_1d, data_2d, data_3d or data_nd
        '''
-        if isinstance(self.args,(list,tuple)): # Multidimensional data
+        if isinstance(self.args, (list, tuple)):  # Multidimensional data
             ndim = len(self.args)
-            if ndim<2:
+            if ndim < 2:
                 warnings.warn('Unable to determine plotter-type, because len(self.args)<2.')
                 print('If the data is 1D, then self.args should be a vector!')
                 print('If the data is 2D, then length(self.args) should be 2.')
                 print('If the data is 3D, then length(self.args) should be 3.')
                 print('Unless you fix this, the plot methods will not work!')
-            elif ndim==2:
+            elif ndim == 2:
                 self.plotter = Plotter_2d(plotmethod)
             else:
                 warnings.warn('Plotter method not implemented for ndim>2')
@@ -118,7 +120,7 @@ class WafoData(object):
 
 
 class AxisLabels:
-    def __init__(self,title='',xlab='',ylab='',zlab='',**kwds):
+    def __init__(self, title='', xlab='', ylab='', zlab='', **kwds):
         self.title = title
         self.xlab = xlab
         self.ylab = ylab
@@ -137,7 +139,7 @@ class AxisLabels:
             h2 = plotbackend.xlabel(self.xlab)
             h3 = plotbackend.ylabel(self.ylab)
             #h4 = plotbackend.zlabel(self.zlab)
-            return h1,h2,h3
+            return h1, h2, h3
         except:
             pass
 
@@ -159,7 +161,7 @@ class Plotter_1d(object):
         scatter : scatter plot
     """
-    def __init__(self,plotmethod='plot'):
+    def __init__(self, plotmethod='plot'):
         self.plotfun = None
         if plotmethod is None:
             plotmethod = 'plot'
@@ -172,14 +174,14 @@ class Plotter_1d(object):
     def show(self):
         plotbackend.show()
 
-    def plot(self,wdata,*args,**kwds):
-        if isinstance(wdata.args,(list,tuple)):
-            args1 = tuple((wdata.args))+(wdata.data,)+args
+    def plot(self, wdata, *args, **kwds):
+        if isinstance(wdata.args, (list, tuple)):
+            args1 = tuple(wdata.args) + (wdata.data,) + args
         else:
-            args1 = tuple((wdata.args,))+(wdata.data,)+args
-        h1 = self.plotfun(*args1,**kwds)
+            args1 = (wdata.args,) + (wdata.data,) + args
+        h1 = self.plotfun(*args1, **kwds)
         h2 = wdata.labels.labelfig()
-        return h1,h2
+        return h1, h2
 
 class Plotter_2d(Plotter_1d):
     """
@@ -192,10 +194,10 @@ class Plotter_2d(Plotter_1d):
         surf
     """
-    def __init__(self,plotmethod='contour'):
+    def __init__(self, plotmethod='contour'):
         if plotmethod is None:
             plotmethod = 'contour'
-        super(Plotter_2d,self).__init__(plotmethod)
+        super(Plotter_2d, self).__init__(plotmethod)
         #self.plotfun = plotbackend.__dict__[plotmethod]
 
@@ -207,4 +209,4 @@ if __name__ == '__main__':
     import doctest
     doctest.testmod()
 else:
-    main()
\ No newline at end of file
+    main()
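
Editor's addendum (illustrative sketch, not part of the patch): a possible
end-to-end peaks-over-threshold workflow built from the APIs added above;
the threshold and block period values are hypothetical.

    import wafo.data
    import wafo.stats as ws
    from wafo.stats.core import findpot, dispersion_idx, reslife

    t, data = wafo.data.sea().T
    Ie = findpot(data, t, thresh=0, tmin=5)          # declustered exceedance indices
    di, b_u, ok_u = dispersion_idx(data[Ie], t[Ie], tb=100)
    mrl = reslife(data[Ie], nu=20)                   # MRL should look linear above b_u
    if b_u is not None:
        phat = ws.genpareto.fit(data[Ie][data[Ie] > b_u], floc=b_u)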