Added functions to stats/core.py:

- reslife
- extremal_idx
- ...

Improved estimation.py
master
Per.Andreas.Brodtkorb 15 years ago
parent a490e23e2f
commit df05bec641

@@ -7,7 +7,3 @@ Statistics package in WAFO Toolbox.
from scipy.stats import *
from wafo.stats.core import *
from wafo.stats.distributions import *
-#from wafo.spectrum.core import SpecData1D
-#import wafo.spectrum.models
-#import wafo.spectrum.dispersion_relation
-#from wafo.data_structures import SpecData1D

@@ -1,10 +1,34 @@
from __future__ import division
import warnings
from wafo.wafodata import WafoData
from wafo.misc import findextrema
from scipy import special
import numpy as np
from numpy import inf
-from numpy import atleast_1d
-from numpy import arange, floor
-__all__ = ['edf']
+from numpy import atleast_1d, nan, ndarray, sqrt, vstack, ones, where, zeros
+from numpy import arange, floor, linspace, asarray, reshape, repeat, product
+__all__ = ['edf', 'edfcnd']
arr = asarray
def valarray(shape, value=nan, typecode=None):
"""Return an array of all value.
"""
out = reshape(repeat([value], product(shape, axis=0), axis=0), shape)
if typecode is not None:
out = out.astype(typecode)
if not isinstance(out, ndarray):
out = arr(out)
return out
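# valarray mirrors the private helper of the same name in scipy.stats'
# distributions module; e.g. valarray((3,), 0.0) gives array([ 0.,  0.,  0.]).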
def _invt(q, df):
return special.stdtrit(df, q)
def _invchi2(q, df):
    # special.chdtri inverts the chi-square survival function, so pass 1 - q
    # to get the quantile function (inverse cdf), consistent with _invt above.
    return special.chdtri(df, 1.0 - q)
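# _invt and _invchi2 are quantile (inverse cdf) helpers: reslife uses the
# Student t quantile for the confidence band of the mean excess, and
# dispersion_idx uses chi-square quantiles for the Poisson dispersion test.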
def edf(x, method=2):
'''
@@ -25,7 +49,7 @@ def edf(x, method=2):
>>> x = np.linspace(0,6,200)
>>> R = ws.rayleigh.rvs(scale=2,size=100)
>>> F = ws.edf(R)
-    >>> F.plot()
+    >>> h = F.plot()
See also edf, pdfplot, cumtrapz
'''
@@ -64,10 +88,10 @@ def edfcnd(x, c=None, method=2):
>>> import wafo.stats as ws
>>> x = np.linspace(0,6,200)
>>> R = ws.rayleigh.rvs(scale=2,size=100)
-    >>> Fc = ws.edfcd(R, 1)
-    >>> Fc.plot()
+    >>> Fc = ws.edfcnd(R, 1)
+    >>> hc = Fc.plot()
>>> F = ws.edf(R)
-    >>> F.plot()
+    >>> h = F.plot()
See also edf, pdfplot, cumtrapz
'''
@@ -84,3 +108,586 @@ def edfcnd(x, c=None, method=2):
F.labels.ylab = 'F(x| X>=%g)' % c
return F
def reslife(data, u=None, umin=None, umax=None, nu=None, nmin=3, alpha=0.05, plotflag=False):
'''
Return Mean Residual Life, i.e., mean excesses vs thresholds
    Parameters
    ----------
data : array_like
vector of data of length N.
u : array-like
threshold values (default linspace(umin, umax, nu))
umin, umax : real scalars
Minimum and maximum threshold, respectively (default min(data), max(data)).
nu : scalar integer
number of threshold values (default min(N-nmin,100))
nmin : scalar integer
Minimum number of extremes to include. (Default 3).
alpha : real scalar
Confidence coefficient (default 0.05)
    plotflag : bool
        If True, plot the mean residual life (default False).
Returns
-------
mrl : WafoData object
Mean residual life values, i.e., mean excesses over thresholds, u.
Notes
-----
    RESLIFE estimates mean excesses over thresholds. The purpose of MRL is
    to determine the threshold where the upper tail of the data can be
    approximated with the generalized Pareto distribution (GPD). The GPD is
    appropriate for the tail if the MRL is a linear function of the
    threshold, u. Theoretically, in the GPD model

        E(X-u0|X>u0) = s0/(1+k)
        E(X-u |X>u)  = s/(1+k) = (s0-k*u)/(1+k)  for u > u0

    where k and s are the shape and scale parameters, respectively, and
    s0 is the scale parameter for the threshold u0 < u.
Example
-------
>>> import wafo
>>> R = wafo.stats.genpareto.rvs(0.1,2,2,size=100)
>>> mrl = reslife(R,nu=20)
>>> h = mrl.plot()
See also
---------
genpareto
fitgenparrange, disprsnidx
'''
if u is None:
sd = np.sort(data)
n = len(data)
nmin = max(nmin, 0)
if 2 * nmin > n:
warnings.warn('nmin possibly too large!')
sdmax, sdmin = sd[-nmin], sd[0]
umax = sdmax if umax is None else min(umax, sdmax)
umin = sdmin if umin is None else max(umin, sdmin)
if nu is None:
nu = min(n - nmin, 100)
u = linspace(umin, umax, nu)
nu = len(u)
#mrl1 = valarray(nu)
#srl = valarray(nu)
#num = valarray(nu)
mean_and_std = lambda data1 : (data1.mean(), data1.std(), data1.size)
dat = arr(data)
tmp = arr([mean_and_std(dat[dat > tresh] - tresh) for tresh in u.tolist()])
mrl, srl, num = tmp.T
p = 1 - alpha
alpha2 = alpha / 2
# Approximate P% confidence interval
    #Za = -invnorm(alpha2)  # known mean
    Za = -_invt(alpha2, num - 1)  # unknown mean
mrlu = mrl + Za * srl / sqrt(num)
mrll = mrl - Za * srl / sqrt(num)
#options.CI = [mrll,mrlu];
#options.numdata = num;
titleTxt = 'Mean residual life with %d%s CI' % (100 * p, '%')
res = WafoData(mrl, u, xlab='Threshold', ylab='Mean Excess', title=titleTxt)
res.workspace = dict(numdata=num, umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
res.children = [WafoData(vstack([mrll, mrlu]).T, u, xlab='Threshold', title=titleTxt)]
res.children_args = [':r']
if plotflag:
res.plot()
return res
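# A minimal sketch (hypothetical helper, not part of the WAFO API) of the
# property reslife exploits: for GPD excesses the empirical mean excess is
# approximately linear in the threshold u. It uses scipy.stats.genpareto,
# whose shape c equals -k in the parameterization of the docstring above.
def _demo_mrl_linearity(c=-0.2, scale=2.0, n=50000):
    import scipy.stats as ss
    x = ss.genpareto.rvs(c, scale=scale, size=n)
    u = np.array([0.25, 0.5, 0.75, 1.0])
    empirical = np.array([(x[x > ui] - ui).mean() for ui in u])
    theoretical = (scale + c * u) / (1 - c)  # linear in u
    return u, empirical, theoretical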
def dispersion_idx(data, t=None, u=None, umin=None, umax=None, nu=None, nmin=10, tb=1,
alpha=0.05, plotflag=False):
'''Return Dispersion Index vs threshold
Parameters
----------
    data, t : array_like
data values and sampled times, respectively.
u : array-like
threshold values (default linspace(umin, umax, nu))
umin, umax : real scalars
Minimum and maximum threshold, respectively (default min(data), max(data)).
nu : scalar integer
number of threshold values (default min(N-nmin,100))
nmin : scalar integer
Minimum number of extremes to include. (Default 10).
tb : Real scalar
Block period (same unit as the sampled times) (default 1)
alpha : real scalar
Confidence coefficient (default 0.05)
    plotflag : bool
        If True, plot the dispersion index (default False).
Returns
-------
DI : WafoData object
Dispersion index
b_u : real scalar
threshold where the number of exceedances in a fixed period (Tb) is
consistent with a Poisson process.
ok_u : array-like
all thresholds where the number of exceedances in a fixed period (Tb) is
consistent with a Poisson process.
Notes
------
    DISPRSNIDX estimates the Dispersion Index (DI) as a function of threshold.
    DI measures the homogeneity of the data; its purpose is to determine
    the threshold where the number of exceedances in a fixed period (Tb) is
    consistent with a Poisson process. For a Poisson process the DI is one.
    Thus the threshold should be chosen so high that DI is not significantly
    different from 1.

    The Poisson hypothesis is not rejected if the estimated DI lies between

        chi2(alpha/2, M-1)/(M-1) < DI < chi2(1-alpha/2, M-1)/(M-1)

    where M is the total number of fixed periods/blocks, generally
    the total number of years in the sample.
Example
-------
>>> import wafo.data
>>> xn = wafo.data.sea()
>>> t, data = xn.T
    >>> Ie = findpot(data, t, 0, 5)
>>> di, u, ok_u = dispersion_idx(data[Ie],t[Ie],tb=100)
>>> h = di.plot() # a threshold around 1 seems appropriate.
vline(u)
See also
--------
reslife,
fitgenparrange,
extremal_idx
References
----------
    Ribatet, M. A. (2006),
    A User's Guide to the POT Package (Version 1.0), August 2006.
    http://cran.r-project.org/

    Cunnane, C. (1979), Note on the Poisson assumption in partial duration
    series models. Water Resources Research, 15(2), 489-494.
'''
# This program is free software; you can redistribute it and/or modify it under the terms of the GNU
# General Public License as published by the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful, but without any warranty; without even
# the implied warranty of merchantability or fitness for a particular purpose. See the GNU General Public
    # License for more details.
# The GNU General Public License can be obtained from http://www.gnu.org/copyleft/gpl.html. You
# can also obtain it by writing to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.
n = len(data)
if t is None:
ti = arange(n)
else:
ti = arr(t) - min(t)
t1 = np.empty(ti.shape,dtype=int)
t1[:] = np.floor(ti / tb)
if u is None:
sd = np.sort(data)
nmin = max(nmin, 0)
if 2 * nmin > n:
warnings.warn('nmin possibly too large!')
sdmax, sdmin = sd[-nmin], sd[0]
umax = sdmax if umax is None else min(umax, sdmax)
umin = sdmin if umin is None else max(umin, sdmin)
if nu is None:
nu = min(n - nmin, 100)
u = linspace(umin, umax, nu)
nu = len(u)
di = np.zeros(nu)
d = arr(data)
    mint = int(min(t1))  # mint should be 0
    maxt = int(max(t1))
    M = maxt - mint + 1
    occ = np.zeros(M)
    for ix, thresh in enumerate(u.tolist()):
        excess = (d > thresh)
lambda_ = excess.sum() / M
for block in range(M):
occ[block] = sum(excess[t1 == block])
di[ix] = occ.var() / lambda_
p = 1 - alpha
diUp = _invchi2(1 - alpha / 2, M - 1) / (M - 1)
diLo = _invchi2(alpha / 2, M - 1) / (M - 1)
# Find appropriate threshold
k1, = np.where((diLo < di) & (di < diUp))
if len(k1) > 0:
ok_u = u[k1]
b_di = (di[k1].mean() < di[k1])
k = b_di.argmax()
b_u = ok_u[k]
else:
b_u = ok_u = None
    CItxt = '%d%s CI' % (100 * p, '%')
    titleTxt = 'Dispersion Index plot'
    res = WafoData(di, u, title=titleTxt, xlab='Threshold', ylab='Dispersion Index')
#'caption',CItxt);
res.workspace = dict(umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
res.children = [WafoData(vstack([diLo * ones(nu), diUp * ones(nu)]).T, u, xlab='Threshold', title=CItxt)]
res.children_args = ['--r']
if plotflag:
        res.plot()
return res, b_u, ok_u
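# A minimal sketch (hypothetical helper, not part of the commit) of the idea
# behind the test above: block counts from a Poisson process have dispersion
# index var/mean close to one, while clustered exceedances give larger values.
def _demo_poisson_di():
    np.random.seed(0)
    counts = np.random.poisson(lam=5.0, size=1000)  # counts per block
    return counts.var() / counts.mean()             # close to 1.0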
def decluster(data, t=None, thresh=None, tmin=1):
'''
Return declustered peaks over threshold values
Parameters
----------
data, t : array-like
data-values and sampling-times, respectively.
thresh : real scalar
minimum threshold for levels in data.
tmin : real scalar
minimum distance to another peak [same unit as t] (default 1)
Returns
-------
    ev, te : ndarray
        extreme values and their corresponding sampling times, respectively,
        i.e., all data > thresh which are at least tmin distance apart.
Example
-------
>>> import pylab
>>> import wafo.data
>>> from wafo.misc import findtc
>>> x = wafo.data.sea()
>>> t, data = x[:400,:].T
>>> itc, iv = findtc(data,0,'dw')
>>> ytc, ttc = data[itc], t[itc]
>>> ymin = 2*data.std()
>>> tmin = 10 # sec
    >>> ye, te = decluster(ytc, ttc, ymin, tmin)
>>> h = pylab.plot(t,data,ttc,ytc,'ro',t,zeros(len(t)),':',te,ye,'k.')
See also
--------
fitgenpar, findpot, extremalidx
'''
if t is None:
t = np.arange(len(data))
i = findpot(data, t, thresh, tmin)
return data[i], t[i]
def findpot(data, t=None, thresh=None, tmin=1):
'''
    Return indices to peaks over threshold values
Parameters
----------
data, t : array-like
data-values and sampling-times, respectively.
thresh : real scalar
minimum threshold for levels in data.
tmin : real scalar
minimum distance to another peak [same unit as t] (default 1)
Returns
-------
    Ie : ndarray
        indices to extreme values, i.e., all data > thresh which are at least
        tmin distance apart.
Example
-------
>>> import pylab
>>> import wafo.data
>>> from wafo.misc import findtc
>>> x = wafo.data.sea()
>>> t, data = x.T
>>> itc, iv = findtc(data,0,'dw')
>>> ytc, ttc = data[itc], t[itc]
>>> ymin = 2*data.std()
>>> tmin = 10 # sec
>>> I = findpot(data, t, ymin, tmin)
>>> yp, tp = data[I], t[I]
>>> Ie = findpot(yp, tp, ymin,tmin)
>>> ye, te = yp[Ie], tp[Ie]
>>> h = pylab.plot(t,data,ttc,ytc,'ro',t,zeros(len(t)),':',te, ye,'k.',tp,yp,'+')
See also
--------
fitgenpar, decluster, extremalidx
'''
Data = arr(data)
if t is None:
ti = np.arange(len(Data))
else:
ti = arr(t)
    Ie, = where(Data > thresh)
Ye = Data[Ie]
Te = ti[Ie]
if len(Ye) <= 1:
return Ie
dT = np.diff(Te)
    notSorted = np.any(dT < 0)
if notSorted:
I = np.argsort(Te)
Te = Te[I]
Ie = Ie[I]
Ye = Ye[I]
dT = np.diff(Te)
isTooSmall = (dT <= tmin)
if np.any(isTooSmall):
isTooClose = np.hstack((isTooSmall[0], isTooSmall[:-1] | isTooSmall[1:], isTooSmall[-1]))
        # Find opening (NO) and closing (NC) indices for data being too close:
iy = findextrema(np.hstack([0, 0, isTooSmall, 0]))
NO = iy[::2] - 1
NC = iy[1::2]
for no, nc in zip(NO, NC):
iz = slice(no, nc)
iOK = _find_ok_peaks(Ye[iz], Te[iz], tmin)
if len(iOK):
isTooClose[no + iOK] = 0
# Remove data which is too close to other data.
if isTooClose.any():
#len(tooClose)>0:
iOK, = where(1 - isTooClose)
Ie = Ie[iOK]
return Ie
def _find_ok_peaks(Ye, Te, Tmin):
'''
Return indices to the largest maxima that are at least Tmin
distance apart.
'''
Ny = len(Ye)
I = np.argsort(-Ye) # sort in descending order
Te1 = Te[I]
oOrder = zeros(Ny, dtype=int)
    oOrder[I] = range(Ny)  # indices to the variables' original locations
isTooClose = zeros(Ny, dtype=bool)
pool = zeros((Ny, 2))
T_range = np.hstack([-Tmin, Tmin])
K = 0
for i, ti in enumerate(Te1):
isTooClose[i] = np.any((pool[:K, 0] <= ti) & (ti <= pool[:K, 1]))
if not isTooClose[i]:
pool[K] = ti + T_range
K += 1
iOK, = where(1 - isTooClose[oOrder])
return iOK
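# Worked example of the greedy rule in _find_ok_peaks (peaks taken in
# descending order, each kept peak blocking a +/- Tmin window): for
# Ye = [1., 3., 2.], Te = [0., 1., 2.] and Tmin = 1.5 the peak 3. at t = 1
# is kept first, its window [-0.5, 2.5] rejects the other two, and the
# result is iOK == array([1]).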
def declustering_time(t):
'''
Returns minimum distance between clusters.
Parameters
----------
t : array-like
sampling times for data.
Returns
-------
tc : real scalar
minimum distance between clusters.
Example
-------
>>> import wafo.data
>>> x = wafo.data.sea()
>>> t, data = x[:400,:].T
>>> Ie = findpot(data,t,0,5);
>>> tc = declustering_time(Ie)
>>> tc
21
'''
t0 = arr(t)
nt = len(t0)
if nt<2:
return arr([])
ti = interexceedance_times(t0)
ei = extremal_idx(ti)
if ei==1:
tc = ti.min()
else:
i = int(np.floor(nt*ei))
sti = -np.sort(-ti)
        tc = sti[min(i, nt - 2)]  # declustering time
return tc
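# Rationale for the rule above (Ferro and Segers' runs declustering): with
# extremal index ei, roughly a fraction ei of the nt interexceedance times
# are between-cluster gaps, so the nt*ei-th largest interexceedance time is
# used as the declustering (run) time.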
def interexceedance_times(t):
'''
Returns interexceedance times of data
Parameters
----------
t : array-like
sampling times for data.
Returns
-------
ti : ndarray
interexceedance times
Example
-------
>>> t = [1,2,5,10]
>>> interexceedance_times(t)
array([1, 3, 5])
'''
return np.diff(np.sort(t))
def extremal_idx(ti):
'''
Returns Extremal Index measuring the dependence of data
Parameters
----------
ti : array-like
interexceedance times for data.
Returns
-------
ei : real scalar
Extremal index.
Notes
-----
    The Extremal Index (EI) is one if the data are independent and less than
    one if there is some dependence. The extremal index can also be interpreted
    as the reciprocal of the mean cluster size.
Example
-------
>>> import wafo.data
>>> x = wafo.data.sea()
>>> t, data = x[:400,:].T
>>> Ie = findpot(data,t,0,5);
>>> ti = interexceedance_times(Ie)
>>> ei = extremal_idx(ti)
>>> ei
1
See also
--------
reslife, fitgenparrange, disprsnidx, findpot, decluster
    References
    ----------
    Christopher A. T. Ferro, Johan Segers (2003)
    Inference for clusters of extreme values
    Journal of the Royal Statistical Society: Series B (Statistical Methodology) 65 (2), 545-556
    doi:10.1111/1467-9868.00401
'''
t = arr(ti)
tmax = t.max()
if tmax<=1:
ei = 0
elif tmax<=2:
ei = min(1, 2*t.mean()**2/((t**2).mean()))
else:
ei = min(1, 2*np.mean(t-1)**2/np.mean((t-1)*(t-2)))
return ei
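# A minimal sketch (hypothetical helper, not part of the commit) checking the
# intervals estimator above on artificial clusters: exceedances arriving in
# pairs of consecutive observations give mean cluster size 2, so the
# extremal index should come out near 0.5.
def _demo_extremal_idx():
    np.random.seed(0)
    ti = np.empty(2000, dtype=int)
    ti[::2] = np.random.geometric(p=0.05, size=1000)  # between-cluster gaps
    ti[1::2] = 1                                      # within-cluster gaps
    return extremal_idx(ti)  # approximately 0.5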
def _test_dispersion_idx():
import wafo.data
xn = wafo.data.sea()
t, data = xn.T
Ie = findpot(data,t,0,5);
di, u, ok_u = dispersion_idx(data[Ie],t[Ie],tb=100)
di.plot() # a threshold around 1 seems appropriate.
di.show()
pass
def _test_findpot():
import pylab
import wafo.data
from wafo.misc import findtc
x = wafo.data.sea()
t, data = x[:, :].T
itc, iv = findtc(data, 0, 'dw')
ytc, ttc = data[itc], t[itc]
ymin = 2 * data.std()
tmin = 10 # sec
I = findpot(data, t, ymin, tmin)
yp, tp = data[I], t[I]
Ie = findpot(yp, tp, ymin, tmin)
ye, te = yp[Ie], tp[Ie]
h = pylab.plot(t, data, ttc,ytc,'ro', t, zeros(len(t)), ':', te, ye, 'kx', tp, yp, '+')
pylab.show() #
pass
def _test_reslife():
import wafo
R = wafo.stats.genpareto.rvs(0.1, 2, 2, size=100)
mrl = reslife(R, nu=20)
mrl.plot()
def main():
#_test_dispersion_idx()
import doctest
doctest.testmod()
if __name__ == '__main__':
main()

@@ -400,15 +400,15 @@ class rv_frozen(object):
def stats(self, moments='mv'):
''' Some statistics of the given RV'''
        kwds = dict(moments=moments)
-        return self.dist.stats(*self.par, **kwds)
+        return self.dist.stats(*self.par)
    def median(self):
-        return self.dist.median(*self.par, **self.kwds)
+        return self.dist.median(*self.par)
    def mean(self):
-        return self.dist.mean(*self.par, **self.kwds)
+        return self.dist.mean(*self.par)
    def var(self):
-        return self.dist.var(*self.par, **self.kwds)
+        return self.dist.var(*self.par)
    def std(self):
-        return self.dist.std(*self.par, **self.kwds)
+        return self.dist.std(*self.par)
def moment(self, n):
par1 = self.par[:self.dist.numargs]
return self.dist.moment(n, *par1)
@@ -418,7 +418,7 @@ class rv_frozen(object):
'''Probability mass function at k of the given RV'''
return self.dist.pmf(k, *self.par)
    def interval(self, alpha):
-        return self.dist.interval(alpha, *self.par, **self.kwds)
+        return self.dist.interval(alpha, *self.par)
# Frozen RV class
@@ -1982,10 +1982,8 @@ class rv_continuous(rv_generic):
def link(self, x, logSF, theta, i):
''' Return dist. par. no. i as function of quantile (x) and log survival probability (sf)
-        Assumptions:
-        ------------
-        theta is list containing all parameters including location and scale.
+        where
+        theta is the list containing all parameters including location and scale.
'''
raise ValueError('Link function not implemented for the %s distribution' % self.name)
return None
@@ -2232,25 +2230,6 @@ class rv_continuous(rv_generic):
'''
return FitDistribution(self, data, *args, **kwds)
# loc0, scale0, method = map(kwds.get, ['loc', 'scale','method'],[none, none,'ml'])
# args, loc0, scale0 = self.fix_loc_scale(args, loc0, scale0)
# Narg = len(args)
# if Narg != self.numargs:
# if Narg > self.numargs:
# raise ValueError, "Too many input arguments."
# else:
# args += (1.0,)*(self.numargs-Narg)
# # location and scale are at the end
# x0 = args + (loc0, scale0)
# if method.lower()[:].startswith('mps'):
# data.sort()
# fitfun = self.nlogps
# else:
# fitfun = self.nnlf
#
# return optimize.fmin(fitfun,x0,args=(ravel(data),),disp=0)
def fit_loc_scale(self, data, *args):
"""
Estimate loc and scale parameters from data using 1st and 2nd moments
@@ -3387,6 +3366,20 @@ class genpareto_gen(rv_continuous):
#vals = 1.0/c * (pow(1-q, -c)-1)
#return vals
def _fitstart(self, data):
d = arr(data)
loc = d.min()-0.01*d.std()
        # Method-of-moments starting values (Hosking & Wallis, 1987) for the
        # GPD parameterized as F(x) = 1 - (1 - k*x/s)**(1/k), where the mean
        # is s/(1+k) and mean**2/variance = 1 + 2*k:
d1 = d-loc
m = d1.mean()
s = d1.std()
shape = ((m/s)**2 - 1)/2
scale = m*((m/s)**2+1)/2
return shape, loc, scale
def hessian_nnlf(self, theta, x, eps=None):
try:
loc = theta[-2]
@@ -6876,6 +6869,10 @@ Skellam distribution
def main():
import matplotlib
matplotlib.interactive(True)
R = norm.rvs(size=100)
phat = norm.fit(R)
phat = genpareto.fit(R[R>0.7],f0=0.1, floc=0.7)
#nbinom(10, 0.75).rvs(3)
t = bernoulli(0.75).rvs(3)
x = np.r_[5, 10]

@@ -10,21 +10,18 @@ from __future__ import division
from wafo.plotbackend import plotbackend
from wafo.misc import ecross, findcross
-from scipy.misc.ppimport import ppimport
+#from scipy.misc.ppimport import ppimport
import numdifftools
from scipy import special
from scipy.linalg import pinv2
from scipy import optimize
-from numpy import alltrue, arange, \
-    ravel, ones, sum, \
-    zeros, log, sqrt, exp
-from numpy import atleast_1d, any, asarray, nan, inf, pi, reshape, repeat, product, ndarray
 import numpy
+import numpy as np
+from numpy import alltrue, arange, ravel, ones, sum, zeros, log, sqrt, exp
+from numpy import (atleast_1d, any, asarray, nan, pi, reshape, repeat,
+                   product, ndarray, isfinite)
from numpy import flatnonzero as nonzero
@@ -33,11 +30,8 @@ __all__ = [
]
floatinfo = np.finfo(float)
#arr = atleast_1d
arr = asarray
all = alltrue
def chi2isf(p, df):
@@ -123,13 +117,13 @@ class rv_frozen(object):
kwds = dict(moments=moments)
return self.dist.stats(*self.par, **kwds)
    def median(self):
-        return self.dist.median(*self.par, **self.kwds)
+        return self.dist.median(*self.par)
    def mean(self):
-        return self.dist.mean(*self.par, **self.kwds)
+        return self.dist.mean(*self.par)
    def var(self):
-        return self.dist.var(*self.par, **self.kwds)
+        return self.dist.var(*self.par)
    def std(self):
-        return self.dist.std(*self.par, **self.kwds)
+        return self.dist.std(*self.par)
def moment(self, n):
par1 = self.par[:self.dist.numargs]
return self.dist.moment(n, *par1)
@@ -139,9 +133,7 @@ class rv_frozen(object):
'''Probability mass function at k of the given RV'''
return self.dist.pmf(k, *self.par)
    def interval(self, alpha):
-        return self.dist.interval(alpha, *self.par, **self.kwds)
+        return self.dist.interval(alpha, *self.par)
# internal class to profile parameters of a given distribution
@@ -261,9 +253,9 @@ class Profile(object):
self.i_free = nonzero(isfree)
self.Lmax = Lmax
-        self.alpha_Lrange = 0.5 * chi2isf(self.alpha, 1) #_WAFODIST.chi2.isf(self.alpha, 1)
+        self.alpha_Lrange = 0.5 * chi2isf(self.alpha, 1)
self.alpha_cross_level = Lmax - self.alpha_Lrange
-        lowLevel = self.alpha_cross_level - self.alpha_Lrange / 7.0
+        #lowLevel = self.alpha_cross_level - self.alpha_Lrange / 7.0
## Check that par are actually at the optimum
phatv = fit_dist.par.copy()
@@ -327,7 +319,7 @@ class Profile(object):
cond = self.data == -numpy.inf
if any(cond):
ind, = cond.nonzero()
-            self.data.put(ind, numpy.finfo(float).min / 2.0)
+            self.data.put(ind, floatinfo.min / 2.0)
ind1 = numpy.where(ind == 0, ind, ind - 1)
cl = self.alpha_cross_level - self.alpha_Lrange / 2.0
t0 = ecross(self.args, self.data, ind1, cl)
@@ -455,7 +447,8 @@ class FitDistribution(rv_frozen):
RV.profile() - Return Profile Log- likelihood or Product Spacing-function.
-    Member variables:
+    Member variables
+    ----------------
data - data used in fitting
alpha - confidence coefficient
method - method used
@@ -476,85 +469,143 @@ class FitDistribution(rv_frozen):
self.dist = dist
numargs = dist.numargs
self.method, self.alpha, self.par_fix, self.search, self.copydata = map(kwds.get, ['method', 'alpha', 'par_fix', 'search', 'copydata'], ['ml', 0.05, None, True, True])
self.data = ravel(data)
if self.copydata:
self.data = self.data.copy()
self.data.sort()
self.method=self.alpha=self.par_fix=self.search=self.copydata=None
m_variables = ['method', 'alpha', 'par_fix', 'search', 'copydata']
m_defaults = ['ml', 0.05, None, True, True]
for (name, val) in zip(m_variables,m_defaults):
setattr(self, name, kwds.get(name,val))
#self.method, self.alpha, self.par_fix, self.search, self.copydata = map(kwds.get, m_variables, m_defaults)
if self.method.lower()[:].startswith('mps'):
self._fitfun = dist.nlogps
else:
self._fitfun = dist.nnlf
allfixed = False
isfinite = numpy.isfinite
somefixed = (self.par_fix != None) and any(isfinite(self.par_fix))
self.data = ravel(data)
if self.copydata:
self.data = self.data.copy()
self.data.sort()
par, fixedn = self._fit(*args, **kwds)
self.par = arr(par)
somefixed = len(fixedn)>0
if somefixed:
fitfun = self._fxfitfun
self.par_fix = tuple(self.par_fix)
allfixed = all(isfinite(self.par_fix))
self.par = atleast_1d(self.par_fix)
self.i_notfixed = nonzero(1 - isfinite(self.par))
self.i_fixed = nonzero(isfinite(self.par))
if len(self.par) != numargs + 2:
raise ValueError, "Wrong number of input arguments."
if len(args) != len(self.i_notfixed):
raise ValueError("Length of args must equal number of non-fixed parameters given in par_fix! (%d) " % len(self.i_notfixed))
x0 = atleast_1d(args)
else:
fitfun = self.fitfun
loc0, scale0 = map(kwds.get, ['loc', 'scale'])
args, loc0, scale0 = dist.fix_loc_scale(args, loc0, scale0)
Narg = len(args)
if Narg != numargs:
if Narg > numargs:
raise ValueError, "Too many input arguments."
else:
args += (1.0,)*(numargs - Narg)
# location and scale are at the end
x0 = args + (loc0, scale0)
x0 = atleast_1d(x0)
numpar = len(x0)
if self.search and not allfixed:
#args=(self.data,),
par = optimize.fmin(fitfun, x0, disp=0)
if not somefixed:
self.par = par
elif (not allfixed) and somefixed:
self.par[self.i_notfixed] = x0
else:
self.par = x0
self.par_fix = [nan,]*len(self.par)
for i in fixedn:
self.par_fix[i] = self.par[i]
self.i_notfixed = nonzero(1 - isfinite(self.par_fix))
self.i_fixed = nonzero(isfinite(self.par_fix))
np = numargs + 2
numpar = numargs + 2
self.par_cov = zeros((numpar, numpar))
self._compute_cov()
# Set confidence interval for parameters
pvar = numpy.diag(self.par_cov)
zcrit = -norm_ppf(self.alpha / 2.0)
self.par_lower = self.par - zcrit * sqrt(pvar)
self.par_upper = self.par + zcrit * sqrt(pvar)
self.par_upper = None
self.par_lower = None
self.par_cov = zeros((np, np))
self.LLmax = -dist.nnlf(self.par, self.data)
self.LPSmax = -dist.nlogps(self.par, self.data)
self.pvalue = self._pvalue(self.par, self.data, unknown_numpar=numpar)
H = numpy.asmatrix(self._hessian_nnlf(self.par, self.data))
def _reduce_func(self, args, kwds):
args = list(args)
Nargs = len(args) - 2
fixedn = []
index = range(Nargs) + [-2, -1]
names = ['f%d' % n for n in range(Nargs)] + ['floc', 'fscale']
x0 = args[:]
for n, key in zip(index, names):
if kwds.has_key(key):
fixedn.append(n)
args[n] = kwds[key]
del x0[n]
fitfun = self._fitfun
if len(fixedn) == 0:
func = fitfun
restore = None
else:
if len(fixedn) == len(index):
raise ValueError, "All parameters fixed. There is nothing to optimize."
def restore(args, theta):
# Replace with theta for all numbers not in fixedn
# This allows the non-fixed values to vary, but
# we still call self.nnlf with all parameters.
i = 0
for n in range(Nargs):
if n not in fixedn:
args[n] = theta[i]
i += 1
return args
def func(theta, x):
newtheta = restore(args[:], theta)
return fitfun(newtheta, x)
return x0, func, restore, args, fixedn
def _fit(self, *args, **kwds):
dist = self.dist
data = self.data
Narg = len(args)
if Narg > dist.numargs:
raise ValueError, "Too many input arguments."
start = [None]*2
if (Narg < dist.numargs) or not (kwds.has_key('loc') and
kwds.has_key('scale')):
start = dist._fitstart(data) # get distribution specific starting locations
args += start[Narg:-2]
loc = kwds.get('loc', start[-2])
scale = kwds.get('scale', start[-1])
args += (loc, scale)
x0, func, restore, args, fixedn = self._reduce_func(args, kwds)
if self.search:
optimizer = kwds.get('optimizer', optimize.fmin)
# convert string to function in scipy.optimize
if not callable(optimizer) and isinstance(optimizer, (str, unicode)):
if not optimizer.startswith('fmin_'):
optimizer = "fmin_"+optimizer
if optimizer == 'fmin_':
optimizer = 'fmin'
try:
optimizer = getattr(optimize, optimizer)
except AttributeError:
raise ValueError, "%s is not a valid optimizer" % optimizer
vals = optimizer(func,x0,args=(ravel(data),),disp=0)
vals = tuple(vals)
else:
vals = tuple(x0)
if restore is not None:
vals = restore(args, vals)
return vals, fixedn
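    # Usage sketch for the fitting machinery above: fixing parameters with
    # the f0/floc/fscale keywords removes them from the optimization, e.g.
    # FitDistribution(genpareto, data, f0=0.1, floc=0.7), or equivalently
    # genpareto.fit(data, f0=0.1, floc=0.7) as in main() of distributions.py,
    # estimates only the scale; method='mps' selects the maximum product of
    # spacings criterion (dist.nlogps) instead of maximum likelihood.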
def _compute_cov(self):
'''Compute covariance
'''
somefixed = (self.par_fix != None) and any(isfinite(self.par_fix))
H = numpy.asmatrix(self.dist.hessian_nnlf(self.par, self.data))
self.H = H
try:
if somefixed:
allfixed = all(isfinite(self.par_fix))
if allfixed:
pass
elif somefixed:
self.par_cov[:,:]=0
else:
pcov = -pinv2(H[self.i_notfixed, :][..., self.i_notfixed])
for row, ix in enumerate(list(self.i_notfixed)):
self.par_cov[ix, self.i_notfixed] = pcov[row, :]
else:
self.par_cov = -pinv2(H)
except:
self.par_cov[:, :] = nan
pvar = numpy.diag(self.par_cov)
zcrit = -norm_ppf(self.alpha / 2.0)#_WAFODIST.norm.ppf(self.alpha / 2.0)
self.par_lower = self.par - zcrit * sqrt(pvar)
self.par_upper = self.par + zcrit * sqrt(pvar)
def fitfun(self, phat):
return self._fitfun(phat, self.data)
@@ -562,7 +613,6 @@ class FitDistribution(rv_frozen):
self.par[self.i_notfixed] = phat10
return self._fitfun(self.par, self.data)
def profile(self, **kwds):
''' Profile Log- likelihood or Log Product Spacing- function,
which can be used for constructing confidence interval for
@@ -812,65 +862,7 @@ class FitDistribution(rv_frozen):
return pvalue
def _hessian_nnlf(self, theta, data, eps=None):
''' approximate hessian of nnlf where theta are the parameters (including loc and scale)
'''
#Nd = len(x)
np = len(theta)
# pab 07.01.2001: Always choose the stepsize h so that
# it is an exactly representable number.
# This is important when calculating numerical derivatives and is
# accomplished by the following.
if eps == None:
eps = (floatinfo.machar.eps) ** 0.4
#xmin = floatinfo.machar.xmin
#myfun = lambda y: max(y,100.0*log(xmin)) #% trick to avoid log of zero
delta = (eps + 2.0) - 2.0
delta2 = delta ** 2.0
# % Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with
# % 1/(d^2 L(theta|x)/dtheta^2)
# % using central differences
dist = self.dist
LL = dist.nnlf(theta, data)
H = zeros((np, np)) #%% Hessian matrix
theta = tuple(theta)
for ix in xrange(np):
sparam = list(theta)
sparam[ix] = theta[ix] + delta
fp = dist.nnlf(sparam, data)
#fp = sum(myfun(x))
sparam[ix] = theta[ix] - delta
fm = dist.nnlf(sparam, data)
#fm = sum(myfun(x))
H[ix, ix] = (fp - 2 * LL + fm) / delta2
for iy in range(ix + 1, np):
sparam[ix] = theta[ix] + delta
sparam[iy] = theta[iy] + delta
fpp = dist.nnlf(sparam, data)
#fpp = sum(myfun(x))
sparam[iy] = theta[iy] - delta
fpm = dist.nnlf(sparam, data)
#fpm = sum(myfun(x))
sparam[ix] = theta[ix] - delta
fmm = dist.nnlf(sparam, data)
#fmm = sum(myfun(x));
sparam[iy] = theta[iy] + delta
fmp = dist.nnlf(sparam, data)
#fmp = sum(myfun(x))
H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2)
H[iy, ix] = H[ix, iy]
sparam[iy] = theta[iy];
# invert the Hessian matrix (i.e. invert the observed information number)
#pcov = -pinv(H);
return - H
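    # The stencil in _hessian_nnlf above is the standard second-order central
    # difference:
    #   d2f/dxi2     ~ (f(x + d*ei) - 2*f(x) + f(x - d*ei)) / d**2
    #   d2f/dxi dxj  ~ (f(+d,+d) + f(-d,-d) - f(+d,-d) - f(-d,+d)) / (4*d**2)
    # with the step d = (eps + 2.0) - 2.0 rounded to an exactly representable
    # number, as the comment at the top of the method explains.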
def main():
_WAFODIST = ppimport('wafo.stats.distributions')

@@ -52,13 +52,13 @@ class WafoData(object):
>>> h = d2.plot()
Plot with confidence interval
-    d3 = wdata(sin(x),x)
-    d3 = set(d3,'dataCI',[sin(x(:))*0.9 sin(x(:))*1.2])
-    plot(d3)
+    >>> d3 = WafoData(np.sin(x),x)
+    >>> d3.children = [WafoData(np.vstack([np.sin(x)*0.9, np.sin(x)*1.2]).T,x)]
+    >>> d3.children_args = [':r']
+    >>> h = d3.plot()
See also
--------
wdata/plot,
specdata,
covdata
'''
@@ -68,6 +68,7 @@ class WafoData(object):
self.date = now()
self.plotter = None
self.children = None
self.children_args = []
self.labels = AxisLabels(**kwds)
self.setplotter()
@@ -76,8 +77,9 @@ class WafoData(object):
if self.children != None:
plotbackend.hold('on')
tmp = []
+            child_args = args + tuple(self.children_args)
            for child in self.children:
-                tmp1 = child.plot(*args, **kwds)
+                tmp1 = child.plot(*child_args, **kwds)
if tmp1 != None:
tmp.append(tmp1)
if len(tmp) == 0:

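# Note on the change above: children are overlaid on the parent's axes, and
# children_args supplies their plot format (e.g. ':r' for a dotted red
# confidence band), as used by reslife and dispersion_idx in stats/core.py.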