Added functions to stats/core.py:

- reslife
- extremal_idx
- and more

Improved estimation.py
master
Per.Andreas.Brodtkorb 15 years ago
parent a490e23e2f
commit df05bec641

@ -1,13 +1,9 @@
""" """
Statistics package in WAFO Toolbox. Statistics package in WAFO Toolbox.
Readme - Readme file for module STATS in WAFO Toolbox Readme - Readme file for module STATS in WAFO Toolbox
""" """
from scipy.stats import * from scipy.stats import *
from wafo.stats.core import * from wafo.stats.core import *
from wafo.stats.distributions import * from wafo.stats.distributions import *
#from wafo.spectrum.core import SpecData1D
#import wafo.spectrum.models
#import wafo.spectrum.dispersion_relation
#from wafo.data_structures import SpecData1D

@ -1,10 +1,34 @@
from __future__ import division
import warnings
from wafo.wafodata import WafoData
from wafo.misc import findextrema
from scipy import special
import numpy as np
from numpy import inf
-from numpy import atleast_1d
from numpy import atleast_1d, nan, ndarray, sqrt, vstack, ones, where, zeros
-from numpy import arange, floor
from numpy import arange, floor, linspace, asarray, reshape, repeat, product
-__all__ = ['edf']
__all__ = ['edf', 'edfcnd', 'reslife', 'dispersion_idx', 'decluster', 'findpot',
           'declustering_time', 'interexceedance_times', 'extremal_idx']

arr = asarray

def valarray(shape, value=nan, typecode=None):
    """Return an array of all value.
    """
    out = reshape(repeat([value], product(shape, axis=0), axis=0), shape)
    if typecode is not None:
        out = out.astype(typecode)
    if not isinstance(out, ndarray):
        out = arr(out)
    return out

def _invt(q, df):
    return special.stdtrit(df, q)

def _invchi2(q, df):
    return special.chdtri(df, q)
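
# Editor's note (not part of this commit): the two wrappers above have
# asymmetric semantics.  special.stdtrit(df, q) inverts the Student t CDF
# (a ppf), while special.chdtri(df, p) inverts the complemented chi-square
# CDF (an isf), i.e. it returns the x with P(X > x) = p.  Equivalently:
#
#     _invt(0.975, 10)   ~ scipy.stats.t.ppf(0.975, 10)
#     _invchi2(0.05, 10) ~ scipy.stats.chi2.isf(0.05, 10)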
def edf(x, method=2):
    '''
@ -25,7 +49,7 @@ def edf(x, method=2):
    >>> x = np.linspace(0,6,200)
    >>> R = ws.rayleigh.rvs(scale=2,size=100)
    >>> F = ws.edf(R)
-    >>> F.plot()
    >>> h = F.plot()

    See also edf, pdfplot, cumtrapz
    '''
@ -64,10 +88,10 @@ def edfcnd(x, c=None, method=2):
    >>> import wafo.stats as ws
    >>> x = np.linspace(0,6,200)
    >>> R = ws.rayleigh.rvs(scale=2,size=100)
-    >>> Fc = ws.edfcd(R, 1)
    >>> Fc = ws.edfcnd(R, 1)
-    >>> Fc.plot()
    >>> hc = Fc.plot()
    >>> F = ws.edf(R)
-    >>> F.plot()
    >>> h = F.plot()

    See also edf, pdfplot, cumtrapz
    '''
@ -84,3 +108,586 @@ def edfcnd(x, c=None, method=2):
    F.labels.ylab = 'F(x| X>=%g)' % c
    return F
def reslife(data, u=None, umin=None, umax=None, nu=None, nmin=3, alpha=0.05, plotflag=False):
'''
Return Mean Residual Life, i.e., mean excesses vs thresholds
Parameters
---------
data : array_like
vector of data of length N.
u : array-like
threshold values (default linspace(umin, umax, nu))
umin, umax : real scalars
Minimum and maximum threshold, respectively (default min(data), max(data)).
nu : scalar integer
number of threshold values (default min(N-nmin,100))
nmin : scalar integer
Minimum number of extremes to include. (Default 3).
alpha : real scalar
Confidence coefficient (default 0.05)
plotflag: bool
Returns
-------
mrl : WafoData object
Mean residual life values, i.e., mean excesses over thresholds, u.
Notes
-----
    RESLIFE estimates mean excesses over thresholds. The purpose of the MRL is
to determine the threshold where the upper tail of the data can be
approximated with the generalized Pareto distribution (GPD). The GPD is
appropriate for the tail, if the MRL is a linear function of the
    threshold, u. Theoretically, in the GPD model,

        E(X-u0 | X>u0) = s0/(1+k)
        E(X-u  | X>u)  = s/(1+k) = (s0-k*u)/(1+k)   for u>u0

    where k and s are the shape and scale parameters, respectively, and
    s0 is the scale parameter for the threshold u0 < u.
Example
-------
>>> import wafo
>>> R = wafo.stats.genpareto.rvs(0.1,2,2,size=100)
>>> mrl = reslife(R,nu=20)
>>> h = mrl.plot()
See also
---------
genpareto
fitgenparrange, disprsnidx
'''
if u is None:
sd = np.sort(data)
n = len(data)
nmin = max(nmin, 0)
if 2 * nmin > n:
warnings.warn('nmin possibly too large!')
sdmax, sdmin = sd[-nmin], sd[0]
umax = sdmax if umax is None else min(umax, sdmax)
umin = sdmin if umin is None else max(umin, sdmin)
if nu is None:
nu = min(n - nmin, 100)
u = linspace(umin, umax, nu)
nu = len(u)
#mrl1 = valarray(nu)
#srl = valarray(nu)
#num = valarray(nu)
mean_and_std = lambda data1 : (data1.mean(), data1.std(), data1.size)
dat = arr(data)
tmp = arr([mean_and_std(dat[dat > tresh] - tresh) for tresh in u.tolist()])
mrl, srl, num = tmp.T
p = 1 - alpha
alpha2 = alpha / 2
# Approximate P% confidence interval
#%Za = -invnorm(alpha2); % known mean
Za = -_invt(alpha2, num - 1) # unknown mean
mrlu = mrl + Za * srl / sqrt(num)
mrll = mrl - Za * srl / sqrt(num)
#options.CI = [mrll,mrlu];
#options.numdata = num;
titleTxt = 'Mean residual life with %d%s CI' % (100 * p, '%')
res = WafoData(mrl, u, xlab='Threshold', ylab='Mean Excess', title=titleTxt)
res.workspace = dict(numdata=num, umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
res.children = [WafoData(vstack([mrll, mrlu]).T, u, xlab='Threshold', title=titleTxt)]
res.children_args = [':r']
if plotflag:
res.plot()
return res
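
# Editor's sketch (not part of this commit): a numerical check of the theory
# quoted in the reslife docstring -- for generalized Pareto data the mean
# excess is approximately linear in the threshold u.  Uses scipy.stats
# directly; the shape/scale values below are illustrative only.
def _mrl_linearity_demo():
    import scipy.stats as ss
    x = ss.genpareto.rvs(0.2, scale=2.0, size=50000)
    for u in linspace(0, 4, 5):
        excess = x[x > u] - u
        # theoretical mean excess for scipy shape c < 1: (scale + c*u)/(1 - c)
        print('u=%4.1f  empirical=%6.3f  theory=%6.3f' %
              (u, excess.mean(), (2.0 + 0.2 * u) / (1 - 0.2)))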
def dispersion_idx(data, t=None, u=None, umin=None, umax=None, nu=None, nmin=10, tb=1,
alpha=0.05, plotflag=False):
'''Return Dispersion Index vs threshold
Parameters
----------
data, ti : array_like
data values and sampled times, respectively.
u : array-like
threshold values (default linspace(umin, umax, nu))
umin, umax : real scalars
Minimum and maximum threshold, respectively (default min(data), max(data)).
nu : scalar integer
number of threshold values (default min(N-nmin,100))
nmin : scalar integer
Minimum number of extremes to include. (Default 10).
tb : Real scalar
Block period (same unit as the sampled times) (default 1)
alpha : real scalar
Confidence coefficient (default 0.05)
plotflag: bool
Returns
-------
DI : WafoData object
Dispersion index
b_u : real scalar
threshold where the number of exceedances in a fixed period (Tb) is
consistent with a Poisson process.
ok_u : array-like
all thresholds where the number of exceedances in a fixed period (Tb) is
consistent with a Poisson process.
Notes
------
    DISPERSION_IDX estimates the Dispersion Index (DI) as a function of threshold.
    DI measures the homogeneity of the data; its purpose is to determine the
    threshold where the number of exceedances in a fixed period (Tb) is
    consistent with a Poisson process. For a Poisson process the DI is one.
    Thus the threshold should be chosen so high that DI is not significantly
    different from 1.

    The Poisson hypothesis is not rejected if the estimated DI satisfies

        chi2(alpha/2, M-1)/(M-1) < DI < chi2(1-alpha/2, M-1)/(M-1)

    where M is the total number of fixed periods/blocks, generally
    the total number of years in the sample.
Example
-------
>>> import wafo.data
>>> xn = wafo.data.sea()
>>> t, data = xn.T
>>> Ie = findpot(data,t,0,5);
>>> di, u, ok_u = dispersion_idx(data[Ie],t[Ie],tb=100)
>>> h = di.plot() # a threshold around 1 seems appropriate.
vline(u)
See also
--------
reslife,
fitgenparrange,
extremal_idx
References
----------
    Ribatet, M. A. (2006),
    "A User's Guide to the POT Package (Version 1.0)", August 2006.
    http://cran.r-project.org/

    Cunnane, C. (1979), "Note on the Poisson assumption in partial duration
    series model", Water Resources Research, 15(2), 489-494.
'''
# This program is free software; you can redistribute it and/or modify it under the terms of the GNU
# General Public License as published by the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful, but without any warranty; without even
# the implied warranty of merchantability or fitness for a particular purpose. See the GNU General Public
    # License for more details.
# The GNU General Public License can be obtained from http://www.gnu.org/copyleft/gpl.html. You
# can also obtain it by writing to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.
n = len(data)
if t is None:
ti = arange(n)
else:
ti = arr(t) - min(t)
t1 = np.empty(ti.shape,dtype=int)
t1[:] = np.floor(ti / tb)
if u is None:
sd = np.sort(data)
nmin = max(nmin, 0)
if 2 * nmin > n:
warnings.warn('nmin possibly too large!')
sdmax, sdmin = sd[-nmin], sd[0]
umax = sdmax if umax is None else min(umax, sdmax)
umin = sdmin if umin is None else max(umin, sdmin)
if nu is None:
nu = min(n - nmin, 100)
u = linspace(umin, umax, nu)
nu = len(u)
di = np.zeros(nu)
d = arr(data)
    mint = int(min(t1))  # mint should be 0
    maxt = int(max(t1))
    M = maxt - mint + 1
    occ = np.zeros(M)
for ix, tresh in enumerate(u.tolist()):
excess = (d > tresh)
lambda_ = excess.sum() / M
for block in range(M):
occ[block] = sum(excess[t1 == block])
di[ix] = occ.var() / lambda_
p = 1 - alpha
    # _invchi2 wraps special.chdtri, which inverts the chi-square survival
    # function, so the upper band comes from the small tail probability.
    diUp = _invchi2(alpha / 2, M - 1) / (M - 1)
    diLo = _invchi2(1 - alpha / 2, M - 1) / (M - 1)
# Find appropriate threshold
k1, = np.where((diLo < di) & (di < diUp))
if len(k1) > 0:
ok_u = u[k1]
b_di = (di[k1].mean() < di[k1])
k = b_di.argmax()
b_u = ok_u[k]
else:
b_u = ok_u = None
CItxt = '%d%s CI' % (100 * p, '%')
    titleTxt = 'Dispersion Index plot'

    res = WafoData(di, u, title=titleTxt, xlab='Threshold', ylab='Dispersion Index')
#'caption',CItxt);
res.workspace = dict(umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
res.children = [WafoData(vstack([diLo * ones(nu), diUp * ones(nu)]).T, u, xlab='Threshold', title=CItxt)]
res.children_args = ['--r']
if plotflag:
        res.plot()
return res, b_u, ok_u
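
# Editor's sketch (not part of this commit): the chi-square acceptance band
# used by dispersion_idx, computed directly for M blocks; a Poisson process
# yields a DI inside this interval with probability 1 - alpha.
def _di_bounds_demo(M=20, alpha=0.05):
    import scipy.stats as ss
    lo = ss.chi2.ppf(alpha / 2, M - 1) / (M - 1)
    up = ss.chi2.ppf(1 - alpha / 2, M - 1) / (M - 1)
    print('Poisson DI acceptance interval: (%.3f, %.3f)' % (lo, up))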
def decluster(data, t=None, thresh=None, tmin=1):
'''
Return declustered peaks over threshold values
Parameters
----------
data, t : array-like
data-values and sampling-times, respectively.
thresh : real scalar
minimum threshold for levels in data.
tmin : real scalar
minimum distance to another peak [same unit as t] (default 1)
Returns
-------
ev, te : ndarray
extreme values and its corresponding sampling times, respectively, i.e.,
all data > thresh which are at least tmin distance apart.
Example
-------
>>> import pylab
>>> import wafo.data
>>> from wafo.misc import findtc
>>> x = wafo.data.sea()
>>> t, data = x[:400,:].T
>>> itc, iv = findtc(data,0,'dw')
>>> ytc, ttc = data[itc], t[itc]
>>> ymin = 2*data.std()
>>> tmin = 10 # sec
>>> [ye, te] = decluster(ytc,ttc, ymin,tmin);
>>> h = pylab.plot(t,data,ttc,ytc,'ro',t,zeros(len(t)),':',te,ye,'k.')
See also
--------
fitgenpar, findpot, extremalidx
'''
if t is None:
t = np.arange(len(data))
i = findpot(data, t, thresh, tmin)
return data[i], t[i]
def findpot(data, t=None, thresh=None, tmin=1):
'''
    Return indices of peaks over threshold values
Parameters
----------
data, t : array-like
data-values and sampling-times, respectively.
thresh : real scalar
minimum threshold for levels in data.
tmin : real scalar
minimum distance to another peak [same unit as t] (default 1)
Returns
-------
Ie : ndarray
        indices to extreme values, i.e., all data > thresh which are at least
tmin distance apart.
Example
-------
>>> import pylab
>>> import wafo.data
>>> from wafo.misc import findtc
>>> x = wafo.data.sea()
>>> t, data = x.T
>>> itc, iv = findtc(data,0,'dw')
>>> ytc, ttc = data[itc], t[itc]
>>> ymin = 2*data.std()
>>> tmin = 10 # sec
>>> I = findpot(data, t, ymin, tmin)
>>> yp, tp = data[I], t[I]
>>> Ie = findpot(yp, tp, ymin,tmin)
>>> ye, te = yp[Ie], tp[Ie]
>>> h = pylab.plot(t,data,ttc,ytc,'ro',t,zeros(len(t)),':',te, ye,'k.',tp,yp,'+')
See also
--------
fitgenpar, decluster, extremalidx
'''
Data = arr(data)
if t is None:
ti = np.arange(len(Data))
else:
ti = arr(t)
    Ie, = where(Data > thresh)
Ye = Data[Ie]
Te = ti[Ie]
if len(Ye) <= 1:
return Ie
dT = np.diff(Te)
    notSorted = np.any(dT < 0)
if notSorted:
I = np.argsort(Te)
Te = Te[I]
Ie = Ie[I]
Ye = Ye[I]
dT = np.diff(Te)
isTooSmall = (dT <= tmin)
if np.any(isTooSmall):
isTooClose = np.hstack((isTooSmall[0], isTooSmall[:-1] | isTooSmall[1:], isTooSmall[-1]))
        # Find opening (NO) and closing (NC) indices for data being too close:
iy = findextrema(np.hstack([0, 0, isTooSmall, 0]))
NO = iy[::2] - 1
NC = iy[1::2]
for no, nc in zip(NO, NC):
iz = slice(no, nc)
iOK = _find_ok_peaks(Ye[iz], Te[iz], tmin)
if len(iOK):
isTooClose[no + iOK] = 0
# Remove data which is too close to other data.
        if isTooClose.any():
            iOK, = where(1 - isTooClose)
Ie = Ie[iOK]
return Ie
def _find_ok_peaks(Ye, Te, Tmin):
'''
Return indices to the largest maxima that are at least Tmin
distance apart.
'''
Ny = len(Ye)
I = np.argsort(-Ye) # sort in descending order
Te1 = Te[I]
oOrder = zeros(Ny, dtype=int)
    oOrder[I] = range(Ny)  # indices to the variables' original locations
isTooClose = zeros(Ny, dtype=bool)
pool = zeros((Ny, 2))
T_range = np.hstack([-Tmin, Tmin])
K = 0
for i, ti in enumerate(Te1):
isTooClose[i] = np.any((pool[:K, 0] <= ti) & (ti <= pool[:K, 1]))
if not isTooClose[i]:
pool[K] = ti + T_range
K += 1
iOK, = where(1 - isTooClose[oOrder])
return iOK
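
# Editor's sketch (not part of this commit): the greedy rule in _find_ok_peaks
# on a toy example -- peaks are visited in order of decreasing height and kept
# only if no previously kept peak lies within Tmin.
def _find_ok_peaks_demo():
    Ye = np.array([1.0, 5.0, 4.0, 2.0])
    Te = np.array([0.0, 1.0, 2.0, 3.0])
    # The highest peak (5.0 at t=1) blocks its neighbours at t=0 and t=2,
    # leaving the peak at t=3, so this prints [1 3].
    print(_find_ok_peaks(Ye, Te, 1.5))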
def declustering_time(t):
'''
Returns minimum distance between clusters.
Parameters
----------
t : array-like
sampling times for data.
Returns
-------
tc : real scalar
minimum distance between clusters.
Example
-------
>>> import wafo.data
>>> x = wafo.data.sea()
>>> t, data = x[:400,:].T
>>> Ie = findpot(data,t,0,5);
>>> tc = declustering_time(Ie)
>>> tc
21
'''
t0 = arr(t)
nt = len(t0)
if nt<2:
return arr([])
ti = interexceedance_times(t0)
ei = extremal_idx(ti)
if ei==1:
tc = ti.min()
else:
i = int(np.floor(nt*ei))
sti = -np.sort(-ti)
tc = sti[min(i, nt-2)] #% declustering time
return tc
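
# Editor's note (not part of this commit): this appears to implement the
# automatic declustering rule of Ferro and Segers (2003) -- with nt
# exceedances and extremal index ei, the declustering time is taken as
# roughly the floor(nt*ei)-th largest interexceedance time, so that about
# nt*ei approximately independent clusters remain.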
def interexceedance_times(t):
'''
Returns interexceedance times of data
Parameters
----------
t : array-like
sampling times for data.
Returns
-------
ti : ndarray
interexceedance times
Example
-------
>>> t = [1,2,5,10]
>>> interexceedance_times(t)
array([1, 3, 5])
'''
return np.diff(np.sort(t))
def extremal_idx(ti):
'''
Returns Extremal Index measuring the dependence of data
Parameters
----------
ti : array-like
interexceedance times for data.
Returns
-------
ei : real scalar
Extremal index.
Notes
-----
    The Extremal Index (EI) is one if the data are independent and less than
    one if there is some dependence. The extremal index can also be interpreted
    as the reciprocal of the mean cluster size.
Example
-------
>>> import wafo.data
>>> x = wafo.data.sea()
>>> t, data = x[:400,:].T
>>> Ie = findpot(data,t,0,5);
>>> ti = interexceedance_times(Ie)
>>> ei = extremal_idx(ti)
>>> ei
1
See also
--------
reslife, fitgenparrange, disprsnidx, findpot, decluster
    References
    ----------
    Christopher A. T. Ferro, Johan Segers (2003)
    "Inference for clusters of extreme values",
    Journal of the Royal Statistical Society: Series B (Statistical Methodology) 65(2), 545-556.
    doi:10.1111/1467-9868.00401
'''
t = arr(ti)
tmax = t.max()
if tmax<=1:
ei = 0
elif tmax<=2:
ei = min(1, 2*t.mean()**2/((t**2).mean()))
else:
ei = min(1, 2*np.mean(t-1)**2/np.mean((t-1)*(t-2)))
return ei
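
# Editor's sketch (not part of this commit): for (nearly) independent
# exceedance times the Ferro-Segers estimator above should be close to one.
def _extremal_idx_demo():
    rnd = np.random.RandomState(0)
    ti = np.diff(np.sort(rnd.randint(0, 100000, size=500)))
    print('EI of independent exceedance times: %g' % extremal_idx(ti))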
def _test_dispersion_idx():
import wafo.data
xn = wafo.data.sea()
t, data = xn.T
Ie = findpot(data,t,0,5);
di, u, ok_u = dispersion_idx(data[Ie],t[Ie],tb=100)
di.plot() # a threshold around 1 seems appropriate.
    di.show()
def _test_findpot():
import pylab
import wafo.data
from wafo.misc import findtc
x = wafo.data.sea()
t, data = x[:, :].T
itc, iv = findtc(data, 0, 'dw')
ytc, ttc = data[itc], t[itc]
ymin = 2 * data.std()
tmin = 10 # sec
I = findpot(data, t, ymin, tmin)
yp, tp = data[I], t[I]
Ie = findpot(yp, tp, ymin, tmin)
ye, te = yp[Ie], tp[Ie]
h = pylab.plot(t, data, ttc,ytc,'ro', t, zeros(len(t)), ':', te, ye, 'kx', tp, yp, '+')
    pylab.show()
def _test_reslife():
import wafo
R = wafo.stats.genpareto.rvs(0.1, 2, 2, size=100)
mrl = reslife(R, nu=20)
mrl.plot()
def main():
#_test_dispersion_idx()
import doctest
doctest.testmod()
if __name__ == '__main__':
main()

@ -400,15 +400,15 @@ class rv_frozen(object):
    def stats(self, moments='mv'):
        ''' Some statistics of the given RV'''
        kwds = dict(moments=moments)
-        return self.dist.stats(*self.par, **kwds)
        return self.dist.stats(*self.par)
    def median(self):
-        return self.dist.median(*self.par, **self.kwds)
        return self.dist.median(*self.par)
    def mean(self):
-        return self.dist.mean(*self.par, **self.kwds)
        return self.dist.mean(*self.par)
    def var(self):
-        return self.dist.var(*self.par, **self.kwds)
        return self.dist.var(*self.par)
    def std(self):
-        return self.dist.std(*self.par, **self.kwds)
        return self.dist.std(*self.par)
    def moment(self, n):
        par1 = self.par[:self.dist.numargs]
        return self.dist.moment(n, *par1)
@ -418,7 +418,7 @@ class rv_frozen(object):
        '''Probability mass function at k of the given RV'''
        return self.dist.pmf(k, *self.par)
    def interval(self, alpha):
-        return self.dist.interval(alpha, *self.par, **self.kwds)
        return self.dist.interval(alpha, *self.par)

# Frozen RV class
@ -1982,10 +1982,8 @@ class rv_continuous(rv_generic):
    def link(self, x, logSF, theta, i):
        ''' Return dist. par. no. i as function of quantile (x) and log survival probability (sf)
-        where
-        ------------
-        theta is list containing all parameters including location and scale.
        Assumptions: theta is the list containing all parameters including location and scale.
        '''
        raise ValueError('Link function not implemented for the %s distribution' % self.name)
        return None
@ -2077,7 +2075,7 @@ class rv_continuous(rv_generic):
        # invert the Hessian matrix (i.e. invert the observed information number)
        #pcov = -pinv(H);
        return -H

    # return starting point for fit (shape arguments + loc + scale)
    def _fitstart(self, data, args=None):
@ -2231,25 +2229,6 @@ class rv_continuous(rv_generic):
        in your namespace will be sorted as well.
        '''
        return FitDistribution(self, data, *args, **kwds)
-#        loc0, scale0, method = map(kwds.get, ['loc', 'scale','method'],[none, none,'ml'])
-#        args, loc0, scale0 = self.fix_loc_scale(args, loc0, scale0)
-#        Narg = len(args)
-#        if Narg != self.numargs:
-#            if Narg > self.numargs:
-#                raise ValueError, "Too many input arguments."
-#            else:
-#                args += (1.0,)*(self.numargs-Narg)
-#        # location and scale are at the end
-#        x0 = args + (loc0, scale0)
-#        if method.lower()[:].startswith('mps'):
-#            data.sort()
-#            fitfun = self.nlogps
-#        else:
-#            fitfun = self.nnlf
-#
-#        return optimize.fmin(fitfun,x0,args=(ravel(data),),disp=0)
    def fit_loc_scale(self, data, *args):
        """
@ -3387,6 +3366,20 @@ class genpareto_gen(rv_continuous):
        #vals = 1.0/c * (pow(1-q, -c)-1)
        #return vals
def _fitstart(self, data):
d = arr(data)
loc = d.min()-0.01*d.std()
#moments estimator
d1 = d-loc
m = d1.mean()
s = d1.std()
shape = ((m/s)**2 - 1)/2
scale = m*((m/s)**2+1)/2
return shape, loc, scale
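    # Editor's note (not part of this commit): these look like GPD moment
    # estimators with a Hosking-style shape k.  Writing m and s for the mean
    # and standard deviation of the shifted data, m**2/s**2 = 1 + 2*k gives
    # shape = ((m/s)**2 - 1)/2 and scale = m*(1 + k) = m*((m/s)**2 + 1)/2;
    # with scipy's shape convention c = -k the sign of the shape would flip.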
    def hessian_nnlf(self, theta, x, eps=None):
        try:
            loc = theta[-2]
@ -6876,6 +6869,10 @@ Skellam distribution
def main():
    import matplotlib
    matplotlib.interactive(True)
R = norm.rvs(size=100)
phat = norm.fit(R)
phat = genpareto.fit(R[R>0.7],f0=0.1, floc=0.7)
    #nbinom(10, 0.75).rvs(3)
    t = bernoulli(0.75).rvs(3)
    x = np.r_[5, 10]

@ -10,21 +10,18 @@ from __future__ import division
from wafo.plotbackend import plotbackend
from wafo.misc import ecross, findcross
-from scipy.misc.ppimport import ppimport
#from scipy.misc.ppimport import ppimport
import numdifftools
from scipy import special
from scipy.linalg import pinv2
from scipy import optimize
-from numpy import alltrue, arange, \
-    ravel, ones, sum, \
-    zeros, log, sqrt, exp
-from numpy import atleast_1d, any, asarray, nan, inf, pi, reshape, repeat, product, ndarray
import numpy
import numpy as np
from numpy import alltrue, arange, ravel, ones, sum, zeros, log, sqrt, exp
from numpy import (atleast_1d, any, asarray, nan, pi, reshape, repeat,
                   product, ndarray, isfinite)
from numpy import flatnonzero as nonzero
@ -33,11 +30,8 @@ __all__ = [
]
floatinfo = np.finfo(float)

#arr = atleast_1d
arr = asarray
all = alltrue
def chi2isf(p, df):
@ -123,13 +117,13 @@ class rv_frozen(object):
        kwds = dict(moments=moments)
        return self.dist.stats(*self.par, **kwds)
    def median(self):
-        return self.dist.median(*self.par, **self.kwds)
        return self.dist.median(*self.par)
    def mean(self):
-        return self.dist.mean(*self.par, **self.kwds)
        return self.dist.mean(*self.par)
    def var(self):
-        return self.dist.var(*self.par, **self.kwds)
        return self.dist.var(*self.par)
    def std(self):
-        return self.dist.std(*self.par, **self.kwds)
        return self.dist.std(*self.par)
    def moment(self, n):
        par1 = self.par[:self.dist.numargs]
        return self.dist.moment(n, *par1)
@ -139,9 +133,7 @@ class rv_frozen(object):
        '''Probability mass function at k of the given RV'''
        return self.dist.pmf(k, *self.par)
    def interval(self, alpha):
-        return self.dist.interval(alpha, *self.par, **self.kwds)
        return self.dist.interval(alpha, *self.par)

# internal class to profile parameters of a given distribution
@ -261,9 +253,9 @@ class Profile(object):
        self.i_free = nonzero(isfree)
        self.Lmax = Lmax
-        self.alpha_Lrange = 0.5 * chi2isf(self.alpha, 1) #_WAFODIST.chi2.isf(self.alpha, 1)
        self.alpha_Lrange = 0.5 * chi2isf(self.alpha, 1)
        self.alpha_cross_level = Lmax - self.alpha_Lrange
-        lowLevel = self.alpha_cross_level - self.alpha_Lrange / 7.0
        #lowLevel = self.alpha_cross_level - self.alpha_Lrange / 7.0

        ## Check that par are actually at the optimum
        phatv = fit_dist.par.copy()
@ -327,7 +319,7 @@ class Profile(object):
        cond = self.data == -numpy.inf
        if any(cond):
            ind, = cond.nonzero()
-            self.data.put(ind, numpy.finfo(float).min / 2.0)
            self.data.put(ind, floatinfo.min / 2.0)
            ind1 = numpy.where(ind == 0, ind, ind - 1)
            cl = self.alpha_cross_level - self.alpha_Lrange / 2.0
            t0 = ecross(self.args, self.data, ind1, cl)
@ -446,16 +438,17 @@ class FitDistribution(rv_frozen):
    def __init__(self, dist, data, *args, **kwds):
        extradoc = '''
    RV.plotfitsumry() - Plot various diagnostic plots to assess quality of fit.
    RV.plotecdf() - Plot Empirical and fitted Cumulative Distribution Function
    RV.plotesf() - Plot Empirical and fitted Survival Function
    RV.plotepdf() - Plot Empirical and fitted Probability Distribution Function
    RV.plotresq() - Displays a residual quantile plot.
    RV.plotresprb() - Displays a residual probability plot.
    RV.profile() - Return Profile Log-likelihood or Product Spacing-function.

    Member variables
    ----------------
    data - data used in fitting
    alpha - confidence coefficient
    method - method used
@ -475,86 +468,144 @@ class FitDistribution(rv_frozen):
self.__doc__ = rv_frozen.__doc__ + extradoc self.__doc__ = rv_frozen.__doc__ + extradoc
self.dist = dist self.dist = dist
numargs = dist.numargs numargs = dist.numargs
-        self.method, self.alpha, self.par_fix, self.search, self.copydata = map(kwds.get, ['method', 'alpha', 'par_fix', 'search', 'copydata'], ['ml', 0.05, None, True, True])
-        self.data = ravel(data)
-        if self.copydata:
-            self.data = self.data.copy()
-        self.data.sort()
        self.method = self.alpha = self.par_fix = self.search = self.copydata = None
        m_variables = ['method', 'alpha', 'par_fix', 'search', 'copydata']
        m_defaults = ['ml', 0.05, None, True, True]
        for (name, val) in zip(m_variables, m_defaults):
            setattr(self, name, kwds.get(name, val))
        #self.method, self.alpha, self.par_fix, self.search, self.copydata = map(kwds.get, m_variables, m_defaults)
        if self.method.lower()[:].startswith('mps'):
            self._fitfun = dist.nlogps
        else:
            self._fitfun = dist.nnlf
-        allfixed = False
-        isfinite = numpy.isfinite
-        somefixed = (self.par_fix != None) and any(isfinite(self.par_fix))
-        if somefixed:
-            fitfun = self._fxfitfun
-            self.par_fix = tuple(self.par_fix)
-            allfixed = all(isfinite(self.par_fix))
-            self.par = atleast_1d(self.par_fix)
-            self.i_notfixed = nonzero(1 - isfinite(self.par))
-            self.i_fixed = nonzero(isfinite(self.par))
-            if len(self.par) != numargs + 2:
-                raise ValueError, "Wrong number of input arguments."
-            if len(args) != len(self.i_notfixed):
-                raise ValueError("Length of args must equal number of non-fixed parameters given in par_fix! (%d) " % len(self.i_notfixed))
-            x0 = atleast_1d(args)
-        else:
-            fitfun = self.fitfun
-            loc0, scale0 = map(kwds.get, ['loc', 'scale'])
-            args, loc0, scale0 = dist.fix_loc_scale(args, loc0, scale0)
-            Narg = len(args)
-            if Narg != numargs:
-                if Narg > numargs:
-                    raise ValueError, "Too many input arguments."
-                else:
-                    args += (1.0,)*(numargs - Narg)
-            # location and scale are at the end
-            x0 = args + (loc0, scale0)
-            x0 = atleast_1d(x0)
-        numpar = len(x0)
-        if self.search and not allfixed:
-            #args=(self.data,),
-            par = optimize.fmin(fitfun, x0, disp=0)
-            if not somefixed:
-                self.par = par
-        elif (not allfixed) and somefixed:
-            self.par[self.i_notfixed] = x0
-        else:
-            self.par = x0
-        np = numargs + 2
-        self.par_upper = None
-        self.par_lower = None
-        self.par_cov = zeros((np, np))
        self.data = ravel(data)
        if self.copydata:
            self.data = self.data.copy()
        self.data.sort()

        par, fixedn = self._fit(*args, **kwds)
        self.par = arr(par)
        somefixed = len(fixedn) > 0
        if somefixed:
            self.par_fix = [nan, ] * len(self.par)
            for i in fixedn:
                self.par_fix[i] = self.par[i]

            self.i_notfixed = nonzero(1 - isfinite(self.par_fix))
            self.i_fixed = nonzero(isfinite(self.par_fix))

        numpar = numargs + 2
        self.par_cov = zeros((numpar, numpar))
        self._compute_cov()

        # Set confidence interval for parameters
        pvar = numpy.diag(self.par_cov)
        zcrit = -norm_ppf(self.alpha / 2.0)
        self.par_lower = self.par - zcrit * sqrt(pvar)
        self.par_upper = self.par + zcrit * sqrt(pvar)

        self.LLmax = -dist.nnlf(self.par, self.data)
        self.LPSmax = -dist.nlogps(self.par, self.data)
        self.pvalue = self._pvalue(self.par, self.data, unknown_numpar=numpar)
-        H = numpy.asmatrix(self._hessian_nnlf(self.par, self.data))
def _reduce_func(self, args, kwds):
args = list(args)
Nargs = len(args) - 2
fixedn = []
index = range(Nargs) + [-2, -1]
names = ['f%d' % n for n in range(Nargs)] + ['floc', 'fscale']
x0 = args[:]
for n, key in zip(index, names):
if kwds.has_key(key):
fixedn.append(n)
args[n] = kwds[key]
del x0[n]
fitfun = self._fitfun
if len(fixedn) == 0:
func = fitfun
restore = None
else:
if len(fixedn) == len(index):
raise ValueError, "All parameters fixed. There is nothing to optimize."
def restore(args, theta):
# Replace with theta for all numbers not in fixedn
# This allows the non-fixed values to vary, but
# we still call self.nnlf with all parameters.
i = 0
for n in range(Nargs):
if n not in fixedn:
args[n] = theta[i]
i += 1
return args
def func(theta, x):
newtheta = restore(args[:], theta)
return fitfun(newtheta, x)
return x0, func, restore, args, fixedn
def _fit(self, *args, **kwds):
dist = self.dist
data = self.data
Narg = len(args)
if Narg > dist.numargs:
raise ValueError, "Too many input arguments."
start = [None]*2
if (Narg < dist.numargs) or not (kwds.has_key('loc') and
kwds.has_key('scale')):
start = dist._fitstart(data) # get distribution specific starting locations
args += start[Narg:-2]
loc = kwds.get('loc', start[-2])
scale = kwds.get('scale', start[-1])
args += (loc, scale)
x0, func, restore, args, fixedn = self._reduce_func(args, kwds)
if self.search:
optimizer = kwds.get('optimizer', optimize.fmin)
# convert string to function in scipy.optimize
if not callable(optimizer) and isinstance(optimizer, (str, unicode)):
if not optimizer.startswith('fmin_'):
optimizer = "fmin_"+optimizer
if optimizer == 'fmin_':
optimizer = 'fmin'
try:
optimizer = getattr(optimize, optimizer)
except AttributeError:
raise ValueError, "%s is not a valid optimizer" % optimizer
vals = optimizer(func,x0,args=(ravel(data),),disp=0)
vals = tuple(vals)
else:
vals = tuple(x0)
if restore is not None:
vals = restore(args, vals)
return vals, fixedn
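
    # Editor's note (not part of this commit): parameters can be frozen via the
    # keywords f0, f1, ..., floc and fscale (handled by _reduce_func above),
    # e.g. genpareto.fit(data, f0=0.1, floc=0.7) optimizes only the remaining
    # free parameters, here the scale.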
    def _compute_cov(self):
        '''Compute covariance
        '''
        somefixed = (self.par_fix != None) and any(isfinite(self.par_fix))
        H = numpy.asmatrix(self.dist.hessian_nnlf(self.par, self.data))
        self.H = H
        try:
-            if allfixed:
-                pass
-            elif somefixed:
-                pcov = -pinv2(H[self.i_notfixed, :][..., self.i_notfixed])
-                for row, ix in enumerate(list(self.i_notfixed)):
-                    self.par_cov[ix, self.i_notfixed] = pcov[row, :]
            if somefixed:
                allfixed = all(isfinite(self.par_fix))
                if allfixed:
                    self.par_cov[:, :] = 0
                else:
                    pcov = -pinv2(H[self.i_notfixed, :][..., self.i_notfixed])
                    for row, ix in enumerate(list(self.i_notfixed)):
                        self.par_cov[ix, self.i_notfixed] = pcov[row, :]
            else:
                self.par_cov = -pinv2(H)
        except:
            self.par_cov[:, :] = nan
-        pvar = numpy.diag(self.par_cov)
-        zcrit = -norm_ppf(self.alpha / 2.0) #_WAFODIST.norm.ppf(self.alpha / 2.0)
-        self.par_lower = self.par - zcrit * sqrt(pvar)
-        self.par_upper = self.par + zcrit * sqrt(pvar)
    def fitfun(self, phat):
        return self._fitfun(phat, self.data)
@ -562,7 +613,6 @@ class FitDistribution(rv_frozen):
        self.par[self.i_notfixed] = phat10
        return self._fitfun(self.par, self.data)
    def profile(self, **kwds):
        ''' Profile Log-likelihood or Log Product Spacing-function,
        which can be used for constructing confidence interval for
@ -812,65 +862,7 @@ class FitDistribution(rv_frozen):
        return pvalue
-    def _hessian_nnlf(self, theta, data, eps=None):
-        ''' approximate hessian of nnlf where theta are the parameters (including loc and scale)
-        '''
-        #Nd = len(x)
-        np = len(theta)
-        # pab 07.01.2001: Always choose the stepsize h so that
-        # it is an exactly representable number.
-        # This is important when calculating numerical derivatives and is
-        # accomplished by the following.
-        if eps == None:
-            eps = (floatinfo.machar.eps) ** 0.4
-        #xmin = floatinfo.machar.xmin
-        #myfun = lambda y: max(y,100.0*log(xmin)) # trick to avoid log of zero
-        delta = (eps + 2.0) - 2.0
-        delta2 = delta ** 2.0
-        # Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with
-        # 1/(d^2 L(theta|x)/dtheta^2)
-        # using central differences
-        dist = self.dist
-        LL = dist.nnlf(theta, data)
-        H = zeros((np, np))  # Hessian matrix
-        theta = tuple(theta)
-        for ix in xrange(np):
-            sparam = list(theta)
-            sparam[ix] = theta[ix] + delta
-            fp = dist.nnlf(sparam, data)
-            sparam[ix] = theta[ix] - delta
-            fm = dist.nnlf(sparam, data)
-            H[ix, ix] = (fp - 2 * LL + fm) / delta2
-            for iy in range(ix + 1, np):
-                sparam[ix] = theta[ix] + delta
-                sparam[iy] = theta[iy] + delta
-                fpp = dist.nnlf(sparam, data)
-                sparam[iy] = theta[iy] - delta
-                fpm = dist.nnlf(sparam, data)
-                sparam[ix] = theta[ix] - delta
-                fmm = dist.nnlf(sparam, data)
-                sparam[iy] = theta[iy] + delta
-                fmp = dist.nnlf(sparam, data)
-                H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2)
-                H[iy, ix] = H[ix, iy]
-            sparam[iy] = theta[iy]
-        # invert the Hessian matrix (i.e. invert the observed information number)
-        #pcov = -pinv(H);
-        return -H
def main():
    _WAFODIST = ppimport('wafo.stats.distributions')

@ -52,39 +52,41 @@ class WafoData(object):
    >>> h = d2.plot()

    Plot with confidence interval
-    d3 = wdata(sin(x),x)
-    d3 = set(d3,'dataCI',[sin(x(:))*0.9 sin(x(:))*1.2])
-    plot(d3)
    >>> d3 = WafoData(np.sin(x), x)
    >>> d3.children = [WafoData(np.vstack([np.sin(x)*0.9, np.sin(x)*1.2]).T, x)]
    >>> d3.children_args = [':r']
    >>> h = d3.plot()

    See also
    --------
-    wdata/plot,
    specdata,
    covdata
    '''
    def __init__(self, data=None, args=None, *args2, **kwds):
        self.data = data
        self.args = args
        self.date = now()
        self.plotter = None
        self.children = None
        self.children_args = []
        self.labels = AxisLabels(**kwds)
        self.setplotter()
    def plot(self, *args, **kwds):
        tmp = None
        if self.children != None:
            plotbackend.hold('on')
            tmp = []
            child_args = args + tuple(self.children_args)
            for child in self.children:
-                tmp1 = child.plot(*args, **kwds)
                tmp1 = child.plot(*child_args, **kwds)
                if tmp1 != None:
                    tmp.append(tmp1)
            if len(tmp) == 0:
                tmp = None
        tmp2 = self.plotter.plot(self, *args, **kwds)
        return tmp2, tmp

    def show(self):
        self.plotter.show()
@ -95,20 +97,20 @@ class WafoData(object):
        return newcopy

    def setplotter(self, plotmethod=None):
        '''
        Set plotter based on the data type data_1d, data_2d, data_3d or data_nd
        '''
        if isinstance(self.args, (list, tuple)):  # Multidimensional data
            ndim = len(self.args)
            if ndim < 2:
                warnings.warn('Unable to determine plotter-type, because len(self.args)<2.')
                print('If the data is 1D, then self.args should be a vector!')
                print('If the data is 2D, then length(self.args) should be 2.')
                print('If the data is 3D, then length(self.args) should be 3.')
                print('Unless you fix this, the plot methods will not work!')
            elif ndim == 2:
                self.plotter = Plotter_2d(plotmethod)
            else:
                warnings.warn('Plotter method not implemented for ndim>2')
@ -118,7 +120,7 @@ class WafoData(object):
class AxisLabels:
    def __init__(self, title='', xlab='', ylab='', zlab='', **kwds):
        self.title = title
        self.xlab = xlab
        self.ylab = ylab
@ -137,7 +139,7 @@ class AxisLabels:
            h2 = plotbackend.xlabel(self.xlab)
            h3 = plotbackend.ylabel(self.ylab)
            #h4 = plotbackend.zlabel(self.zlab)
            return h1, h2, h3
        except:
            pass
@ -159,7 +161,7 @@ class Plotter_1d(object):
    scatter : scatter plot
    """
    def __init__(self, plotmethod='plot'):
        self.plotfun = None
        if plotmethod is None:
            plotmethod = 'plot'
@ -172,14 +174,14 @@ class Plotter_1d(object):
    def show(self):
        plotbackend.show()

    def plot(self, wdata, *args, **kwds):
        if isinstance(wdata.args, (list, tuple)):
            args1 = tuple((wdata.args)) + (wdata.data,) + args
        else:
            args1 = tuple((wdata.args,)) + (wdata.data,) + args
        h1 = self.plotfun(*args1, **kwds)
        h2 = wdata.labels.labelfig()
        return h1, h2

class Plotter_2d(Plotter_1d):
    """
@ -192,10 +194,10 @@ class Plotter_2d(Plotter_1d):
    surf
    """
    def __init__(self, plotmethod='contour'):
        if plotmethod is None:
            plotmethod = 'contour'
        super(Plotter_2d, self).__init__(plotmethod)
        #self.plotfun = plotbackend.__dict__[plotmethod]
@ -207,4 +209,4 @@ if __name__ == '__main__':
    import doctest
    doctest.testmod()
else:
    main()
