Added functions to stats/core.py:

- reslife
- extremal_idx
- ...

Improved estimation.py
master
Per.Andreas.Brodtkorb 15 years ago
parent a490e23e2f
commit df05bec641

@@ -7,7 +7,3 @@ Statistics package in WAFO Toolbox.
from scipy.stats import *
from wafo.stats.core import *
from wafo.stats.distributions import *
-#from wafo.spectrum.core import SpecData1D
-#import wafo.spectrum.models
-#import wafo.spectrum.dispersion_relation
-#from wafo.data_structures import SpecData1D

@@ -1,10 +1,34 @@
from __future__ import division
import warnings
from wafo.wafodata import WafoData
from wafo.misc import findextrema
from scipy import special
import numpy as np
from numpy import inf
-from numpy import atleast_1d
-from numpy import arange, floor
-__all__ = ['edf']
+from numpy import atleast_1d, nan, ndarray, sqrt, vstack, ones, where, zeros
+from numpy import arange, floor, linspace, asarray, reshape, repeat, product
+__all__ = ['edf', 'edfcnd']
arr = asarray
def valarray(shape, value=nan, typecode=None):
"""Return an array of all value.
"""
out = reshape(repeat([value], product(shape, axis=0), axis=0), shape)
if typecode is not None:
out = out.astype(typecode)
if not isinstance(out, ndarray):
out = arr(out)
return out
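# valarray mirrors the private helper of the same name in scipy.stats'
# distributions module; e.g. valarray((3,), 0.0) gives array([ 0.,  0.,  0.]).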
def _invt(q, df):
return special.stdtrit(df, q)
def _invchi2(q, df):
    # special.chdtri inverts the chi-square survival function, so pass 1 - q
    # to get the quantile function (inverse cdf), consistent with _invt above.
    return special.chdtri(df, 1.0 - q)
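# _invt and _invchi2 are quantile (inverse cdf) helpers: reslife uses the
# Student t quantile for the confidence band of the mean excess, and
# dispersion_idx uses chi-square quantiles for the Poisson dispersion test.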
def edf(x, method=2):
'''
@@ -25,7 +49,7 @@ def edf(x, method=2):
>>> x = np.linspace(0,6,200)
>>> R = ws.rayleigh.rvs(scale=2,size=100)
>>> F = ws.edf(R)
-    >>> F.plot()
+    >>> h = F.plot()
See also edf, pdfplot, cumtrapz
'''
@@ -64,10 +88,10 @@ def edfcnd(x, c=None, method=2):
>>> import wafo.stats as ws
>>> x = np.linspace(0,6,200)
>>> R = ws.rayleigh.rvs(scale=2,size=100)
-    >>> Fc = ws.edfcd(R, 1)
-    >>> Fc.plot()
+    >>> Fc = ws.edfcnd(R, 1)
+    >>> hc = Fc.plot()
>>> F = ws.edf(R)
-    >>> F.plot()
+    >>> h = F.plot()
See also edf, pdfplot, cumtrapz
'''
@@ -84,3 +108,586 @@ def edfcnd(x, c=None, method=2):
F.labels.ylab = 'F(x| X>=%g)' % c
return F
def reslife(data, u=None, umin=None, umax=None, nu=None, nmin=3, alpha=0.05, plotflag=False):
'''
Return Mean Residual Life, i.e., mean excesses vs thresholds
    Parameters
    ----------
data : array_like
vector of data of length N.
u : array-like
threshold values (default linspace(umin, umax, nu))
umin, umax : real scalars
Minimum and maximum threshold, respectively (default min(data), max(data)).
nu : scalar integer
number of threshold values (default min(N-nmin,100))
nmin : scalar integer
Minimum number of extremes to include. (Default 3).
alpha : real scalar
Confidence coefficient (default 0.05)
    plotflag : bool
        If True, plot the mean residual life (default False).
Returns
-------
mrl : WafoData object
Mean residual life values, i.e., mean excesses over thresholds, u.
Notes
-----
    RESLIFE estimates mean excesses over thresholds. The purpose of MRL is
    to determine the threshold where the upper tail of the data can be
    approximated with the generalized Pareto distribution (GPD). The GPD is
    appropriate for the tail if the MRL is a linear function of the
    threshold, u. Theoretically, in the GPD model

        E(X-u0|X>u0) = s0/(1+k)
        E(X-u |X>u)  = s/(1+k) = (s0-k*u)/(1+k)  for u > u0

    where k and s are the shape and scale parameters, respectively, and
    s0 is the scale parameter for the threshold u0 < u.
Example
-------
>>> import wafo
>>> R = wafo.stats.genpareto.rvs(0.1,2,2,size=100)
>>> mrl = reslife(R,nu=20)
>>> h = mrl.plot()
See also
---------
genpareto
fitgenparrange, disprsnidx
'''
if u is None:
sd = np.sort(data)
n = len(data)
nmin = max(nmin, 0)
if 2 * nmin > n:
warnings.warn('nmin possibly too large!')
sdmax, sdmin = sd[-nmin], sd[0]
umax = sdmax if umax is None else min(umax, sdmax)
umin = sdmin if umin is None else max(umin, sdmin)
if nu is None:
nu = min(n - nmin, 100)
u = linspace(umin, umax, nu)
nu = len(u)
#mrl1 = valarray(nu)
#srl = valarray(nu)
#num = valarray(nu)
mean_and_std = lambda data1 : (data1.mean(), data1.std(), data1.size)
dat = arr(data)
tmp = arr([mean_and_std(dat[dat > tresh] - tresh) for tresh in u.tolist()])
mrl, srl, num = tmp.T
p = 1 - alpha
alpha2 = alpha / 2
# Approximate P% confidence interval
    #Za = -invnorm(alpha2)  # known mean
    Za = -_invt(alpha2, num - 1)  # unknown mean
mrlu = mrl + Za * srl / sqrt(num)
mrll = mrl - Za * srl / sqrt(num)
#options.CI = [mrll,mrlu];
#options.numdata = num;
titleTxt = 'Mean residual life with %d%s CI' % (100 * p, '%')
res = WafoData(mrl, u, xlab='Threshold', ylab='Mean Excess', title=titleTxt)
res.workspace = dict(numdata=num, umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
res.children = [WafoData(vstack([mrll, mrlu]).T, u, xlab='Threshold', title=titleTxt)]
res.children_args = [':r']
if plotflag:
res.plot()
return res
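# A minimal sketch (hypothetical helper, not part of the WAFO API) of the
# property reslife exploits: for GPD excesses the empirical mean excess is
# approximately linear in the threshold u. It uses scipy.stats.genpareto,
# whose shape c equals -k in the parameterization of the docstring above.
def _demo_mrl_linearity(c=-0.2, scale=2.0, n=50000):
    import scipy.stats as ss
    x = ss.genpareto.rvs(c, scale=scale, size=n)
    u = np.array([0.25, 0.5, 0.75, 1.0])
    empirical = np.array([(x[x > ui] - ui).mean() for ui in u])
    theoretical = (scale + c * u) / (1 - c)  # linear in u
    return u, empirical, theoretical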
def dispersion_idx(data, t=None, u=None, umin=None, umax=None, nu=None, nmin=10, tb=1,
alpha=0.05, plotflag=False):
'''Return Dispersion Index vs threshold
Parameters
----------
    data, t : array_like
data values and sampled times, respectively.
u : array-like
threshold values (default linspace(umin, umax, nu))
umin, umax : real scalars
Minimum and maximum threshold, respectively (default min(data), max(data)).
nu : scalar integer
number of threshold values (default min(N-nmin,100))
nmin : scalar integer
Minimum number of extremes to include. (Default 10).
tb : Real scalar
Block period (same unit as the sampled times) (default 1)
alpha : real scalar
Confidence coefficient (default 0.05)
    plotflag : bool
        If True, plot the dispersion index (default False).
Returns
-------
DI : WafoData object
Dispersion index
b_u : real scalar
threshold where the number of exceedances in a fixed period (Tb) is
consistent with a Poisson process.
ok_u : array-like
all thresholds where the number of exceedances in a fixed period (Tb) is
consistent with a Poisson process.
Notes
------
    DISPRSNIDX estimates the Dispersion Index (DI) as a function of threshold.
    DI measures the homogeneity of the data; its purpose is to determine
    the threshold where the number of exceedances in a fixed period (Tb) is
    consistent with a Poisson process. For a Poisson process the DI is one.
    Thus the threshold should be chosen so high that DI is not significantly
    different from 1.

    The Poisson hypothesis is not rejected if the estimated DI lies between

        chi2(alpha/2, M-1)/(M-1) < DI < chi2(1-alpha/2, M-1)/(M-1)

    where M is the total number of fixed periods/blocks, generally
    the total number of years in the sample.
Example
-------
>>> import wafo.data
>>> xn = wafo.data.sea()
>>> t, data = xn.T
    >>> Ie = findpot(data, t, 0, 5)
>>> di, u, ok_u = dispersion_idx(data[Ie],t[Ie],tb=100)
>>> h = di.plot() # a threshold around 1 seems appropriate.
vline(u)
See also
--------
reslife,
fitgenparrange,
extremal_idx
References
----------
    Ribatet, M. A. (2006),
    A User's Guide to the POT Package (Version 1.0), August 2006.
    http://cran.r-project.org/

    Cunnane, C. (1979), Note on the Poisson assumption in partial duration
    series models. Water Resources Research, 15(2), 489-494.
'''
# This program is free software; you can redistribute it and/or modify it under the terms of the GNU
# General Public License as published by the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful, but without any warranty; without even
# the implied warranty of merchantability or fitness for a particular purpose. See the GNU General Public
    # License for more details.
# The GNU General Public License can be obtained from http://www.gnu.org/copyleft/gpl.html. You
# can also obtain it by writing to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.
n = len(data)
if t is None:
ti = arange(n)
else:
ti = arr(t) - min(t)
t1 = np.empty(ti.shape,dtype=int)
t1[:] = np.floor(ti / tb)
if u is None:
sd = np.sort(data)
nmin = max(nmin, 0)
if 2 * nmin > n:
warnings.warn('nmin possibly too large!')
sdmax, sdmin = sd[-nmin], sd[0]
umax = sdmax if umax is None else min(umax, sdmax)
umin = sdmin if umin is None else max(umin, sdmin)
if nu is None:
nu = min(n - nmin, 100)
u = linspace(umin, umax, nu)
nu = len(u)
di = np.zeros(nu)
d = arr(data)
    mint = int(min(t1))  # mint should be 0
    maxt = int(max(t1))
    M = maxt - mint + 1
    occ = np.zeros(M)
    for ix, thresh in enumerate(u.tolist()):
        excess = (d > thresh)
lambda_ = excess.sum() / M
for block in range(M):
occ[block] = sum(excess[t1 == block])
di[ix] = occ.var() / lambda_
p = 1 - alpha
diUp = _invchi2(1 - alpha / 2, M - 1) / (M - 1)
diLo = _invchi2(alpha / 2, M - 1) / (M - 1)
# Find appropriate threshold
k1, = np.where((diLo < di) & (di < diUp))
if len(k1) > 0:
ok_u = u[k1]
b_di = (di[k1].mean() < di[k1])
k = b_di.argmax()
b_u = ok_u[k]
else:
b_u = ok_u = None
    CItxt = '%d%s CI' % (100 * p, '%')
    titleTxt = 'Dispersion Index plot'
    res = WafoData(di, u, title=titleTxt, xlab='Threshold', ylab='Dispersion Index')
#'caption',CItxt);
res.workspace = dict(umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
res.children = [WafoData(vstack([diLo * ones(nu), diUp * ones(nu)]).T, u, xlab='Threshold', title=CItxt)]
res.children_args = ['--r']
if plotflag:
        res.plot()
return res, b_u, ok_u
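# A minimal sketch (hypothetical helper, not part of the commit) of the idea
# behind the test above: block counts from a Poisson process have dispersion
# index var/mean close to one, while clustered exceedances give larger values.
def _demo_poisson_di():
    np.random.seed(0)
    counts = np.random.poisson(lam=5.0, size=1000)  # counts per block
    return counts.var() / counts.mean()             # close to 1.0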
def decluster(data, t=None, thresh=None, tmin=1):
'''
Return declustered peaks over threshold values
Parameters
----------
data, t : array-like
data-values and sampling-times, respectively.
thresh : real scalar
minimum threshold for levels in data.
tmin : real scalar
minimum distance to another peak [same unit as t] (default 1)
Returns
-------
    ev, te : ndarray
        extreme values and their corresponding sampling times, respectively,
        i.e., all data > thresh which are at least tmin distance apart.
Example
-------
>>> import pylab
>>> import wafo.data
>>> from wafo.misc import findtc
>>> x = wafo.data.sea()
>>> t, data = x[:400,:].T
>>> itc, iv = findtc(data,0,'dw')
>>> ytc, ttc = data[itc], t[itc]
>>> ymin = 2*data.std()
>>> tmin = 10 # sec
    >>> ye, te = decluster(ytc, ttc, ymin, tmin)
>>> h = pylab.plot(t,data,ttc,ytc,'ro',t,zeros(len(t)),':',te,ye,'k.')
See also
--------
fitgenpar, findpot, extremalidx
'''
if t is None:
t = np.arange(len(data))
i = findpot(data, t, thresh, tmin)
return data[i], t[i]
def findpot(data, t=None, thresh=None, tmin=1):
'''
    Return indices to peaks over threshold values
Parameters
----------
data, t : array-like
data-values and sampling-times, respectively.
thresh : real scalar
minimum threshold for levels in data.
tmin : real scalar
minimum distance to another peak [same unit as t] (default 1)
Returns
-------
    Ie : ndarray
        indices to extreme values, i.e., all data > thresh which are at least
        tmin distance apart.
Example
-------
>>> import pylab
>>> import wafo.data
>>> from wafo.misc import findtc
>>> x = wafo.data.sea()
>>> t, data = x.T
>>> itc, iv = findtc(data,0,'dw')
>>> ytc, ttc = data[itc], t[itc]
>>> ymin = 2*data.std()
>>> tmin = 10 # sec
>>> I = findpot(data, t, ymin, tmin)
>>> yp, tp = data[I], t[I]
>>> Ie = findpot(yp, tp, ymin,tmin)
>>> ye, te = yp[Ie], tp[Ie]
>>> h = pylab.plot(t,data,ttc,ytc,'ro',t,zeros(len(t)),':',te, ye,'k.',tp,yp,'+')
See also
--------
fitgenpar, decluster, extremalidx
'''
Data = arr(data)
if t is None:
ti = np.arange(len(Data))
else:
ti = arr(t)
    Ie, = where(Data > thresh)
Ye = Data[Ie]
Te = ti[Ie]
if len(Ye) <= 1:
return Ie
dT = np.diff(Te)
    notSorted = np.any(dT < 0)
if notSorted:
I = np.argsort(Te)
Te = Te[I]
Ie = Ie[I]
Ye = Ye[I]
dT = np.diff(Te)
isTooSmall = (dT <= tmin)
if np.any(isTooSmall):
isTooClose = np.hstack((isTooSmall[0], isTooSmall[:-1] | isTooSmall[1:], isTooSmall[-1]))
        # Find opening (NO) and closing (NC) indices for data being too close:
iy = findextrema(np.hstack([0, 0, isTooSmall, 0]))
NO = iy[::2] - 1
NC = iy[1::2]
for no, nc in zip(NO, NC):
iz = slice(no, nc)
iOK = _find_ok_peaks(Ye[iz], Te[iz], tmin)
if len(iOK):
isTooClose[no + iOK] = 0
# Remove data which is too close to other data.
if isTooClose.any():
#len(tooClose)>0:
iOK, = where(1 - isTooClose)
Ie = Ie[iOK]
return Ie
def _find_ok_peaks(Ye, Te, Tmin):
'''
Return indices to the largest maxima that are at least Tmin
distance apart.
'''
Ny = len(Ye)
I = np.argsort(-Ye) # sort in descending order
Te1 = Te[I]
oOrder = zeros(Ny, dtype=int)
    oOrder[I] = range(Ny)  # indices to the variables' original locations
isTooClose = zeros(Ny, dtype=bool)
pool = zeros((Ny, 2))
T_range = np.hstack([-Tmin, Tmin])
K = 0
for i, ti in enumerate(Te1):
isTooClose[i] = np.any((pool[:K, 0] <= ti) & (ti <= pool[:K, 1]))
if not isTooClose[i]:
pool[K] = ti + T_range
K += 1
iOK, = where(1 - isTooClose[oOrder])
return iOK
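# Worked example of the greedy rule in _find_ok_peaks (peaks taken in
# descending order, each kept peak blocking a +/- Tmin window): for
# Ye = [1., 3., 2.], Te = [0., 1., 2.] and Tmin = 1.5 the peak 3. at t = 1
# is kept first, its window [-0.5, 2.5] rejects the other two, and the
# result is iOK == array([1]).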
def declustering_time(t):
'''
Returns minimum distance between clusters.
Parameters
----------
t : array-like
sampling times for data.
Returns
-------
tc : real scalar
minimum distance between clusters.
Example
-------
>>> import wafo.data
>>> x = wafo.data.sea()
>>> t, data = x[:400,:].T
>>> Ie = findpot(data,t,0,5);
>>> tc = declustering_time(Ie)
>>> tc
21
'''
t0 = arr(t)
nt = len(t0)
if nt<2:
return arr([])
ti = interexceedance_times(t0)
ei = extremal_idx(ti)
if ei==1:
tc = ti.min()
else:
i = int(np.floor(nt*ei))
sti = -np.sort(-ti)
        tc = sti[min(i, nt - 2)]  # declustering time
return tc
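# Rationale for the rule above (Ferro and Segers' runs declustering): with
# extremal index ei, roughly a fraction ei of the nt interexceedance times
# are between-cluster gaps, so the nt*ei-th largest interexceedance time is
# used as the declustering (run) time.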
def interexceedance_times(t):
'''
Returns interexceedance times of data
Parameters
----------
t : array-like
sampling times for data.
Returns
-------
ti : ndarray
interexceedance times
Example
-------
>>> t = [1,2,5,10]
>>> interexceedance_times(t)
array([1, 3, 5])
'''
return np.diff(np.sort(t))
def extremal_idx(ti):
'''
Returns Extremal Index measuring the dependence of data
Parameters
----------
ti : array-like
interexceedance times for data.
Returns
-------
ei : real scalar
Extremal index.
Notes
-----
    The Extremal Index (EI) is one if the data are independent and less than
    one if there is some dependence. The extremal index can also be interpreted
    as the reciprocal of the mean cluster size.
Example
-------
>>> import wafo.data
>>> x = wafo.data.sea()
>>> t, data = x[:400,:].T
>>> Ie = findpot(data,t,0,5);
>>> ti = interexceedance_times(Ie)
>>> ei = extremal_idx(ti)
>>> ei
1
See also
--------
reslife, fitgenparrange, disprsnidx, findpot, decluster
    References
    ----------
    Christopher A. T. Ferro, Johan Segers (2003)
    Inference for clusters of extreme values
    Journal of the Royal Statistical Society: Series B (Statistical Methodology) 65 (2), 545-556
    doi:10.1111/1467-9868.00401
'''
t = arr(ti)
tmax = t.max()
if tmax<=1:
ei = 0
elif tmax<=2:
ei = min(1, 2*t.mean()**2/((t**2).mean()))
else:
ei = min(1, 2*np.mean(t-1)**2/np.mean((t-1)*(t-2)))
return ei
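# A minimal sketch (hypothetical helper, not part of the commit) checking the
# intervals estimator above on artificial clusters: exceedances arriving in
# pairs of consecutive observations give mean cluster size 2, so the
# extremal index should come out near 0.5.
def _demo_extremal_idx():
    np.random.seed(0)
    ti = np.empty(2000, dtype=int)
    ti[::2] = np.random.geometric(p=0.05, size=1000)  # between-cluster gaps
    ti[1::2] = 1                                      # within-cluster gaps
    return extremal_idx(ti)  # approximately 0.5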
def _test_dispersion_idx():
import wafo.data
xn = wafo.data.sea()
t, data = xn.T
Ie = findpot(data,t,0,5);
di, u, ok_u = dispersion_idx(data[Ie],t[Ie],tb=100)
di.plot() # a threshold around 1 seems appropriate.
di.show()
pass
def _test_findpot():
import pylab
import wafo.data
from wafo.misc import findtc
x = wafo.data.sea()
t, data = x[:, :].T
itc, iv = findtc(data, 0, 'dw')
ytc, ttc = data[itc], t[itc]
ymin = 2 * data.std()
tmin = 10 # sec
I = findpot(data, t, ymin, tmin)
yp, tp = data[I], t[I]
Ie = findpot(yp, tp, ymin, tmin)
ye, te = yp[Ie], tp[Ie]
h = pylab.plot(t, data, ttc,ytc,'ro', t, zeros(len(t)), ':', te, ye, 'kx', tp, yp, '+')
pylab.show() #
pass
def _test_reslife():
import wafo
R = wafo.stats.genpareto.rvs(0.1, 2, 2, size=100)
mrl = reslife(R, nu=20)
mrl.plot()
def main():
#_test_dispersion_idx()
import doctest
doctest.testmod()
if __name__ == '__main__':
main()

@@ -400,15 +400,15 @@ class rv_frozen(object):
def stats(self, moments='mv'):
''' Some statistics of the given RV'''
        kwds = dict(moments=moments)
-        return self.dist.stats(*self.par, **kwds)
+        return self.dist.stats(*self.par)
    def median(self):
-        return self.dist.median(*self.par, **self.kwds)
+        return self.dist.median(*self.par)
    def mean(self):
-        return self.dist.mean(*self.par, **self.kwds)
+        return self.dist.mean(*self.par)
    def var(self):
-        return self.dist.var(*self.par, **self.kwds)
+        return self.dist.var(*self.par)
    def std(self):
-        return self.dist.std(*self.par, **self.kwds)
+        return self.dist.std(*self.par)
def moment(self, n):
par1 = self.par[:self.dist.numargs]
return self.dist.moment(n, *par1)
@@ -418,7 +418,7 @@ class rv_frozen(object):
'''Probability mass function at k of the given RV'''
return self.dist.pmf(k, *self.par)
    def interval(self, alpha):
-        return self.dist.interval(alpha, *self.par, **self.kwds)
+        return self.dist.interval(alpha, *self.par)
# Frozen RV class
@@ -1982,10 +1982,8 @@ class rv_continuous(rv_generic):
def link(self, x, logSF, theta, i):
''' Return dist. par. no. i as function of quantile (x) and log survival probability (sf)
-        Assumptions:
-        ------------
-        theta is list containing all parameters including location and scale.
+        where
+        theta is the list containing all parameters including location and scale.
'''
raise ValueError('Link function not implemented for the %s distribution' % self.name)
return None
@@ -2232,25 +2230,6 @@ class rv_continuous(rv_generic):
'''
return FitDistribution(self, data, *args, **kwds)
# loc0, scale0, method = map(kwds.get, ['loc', 'scale','method'],[none, none,'ml'])
# args, loc0, scale0 = self.fix_loc_scale(args, loc0, scale0)
# Narg = len(args)
# if Narg != self.numargs:
# if Narg > self.numargs:
# raise ValueError, "Too many input arguments."
# else:
# args += (1.0,)*(self.numargs-Narg)
# # location and scale are at the end
# x0 = args + (loc0, scale0)
# if method.lower()[:].startswith('mps'):
# data.sort()
# fitfun = self.nlogps
# else:
# fitfun = self.nnlf
#
# return optimize.fmin(fitfun,x0,args=(ravel(data),),disp=0)
def fit_loc_scale(self, data, *args):
"""
Estimate loc and scale parameters from data using 1st and 2nd moments
@@ -3387,6 +3366,20 @@ class genpareto_gen(rv_continuous):
#vals = 1.0/c * (pow(1-q, -c)-1)
#return vals
def _fitstart(self, data):
d = arr(data)
loc = d.min()-0.01*d.std()
        # Method-of-moments starting values (Hosking & Wallis, 1987) for the
        # GPD parameterized as F(x) = 1 - (1 - k*x/s)**(1/k), where the mean
        # is s/(1+k) and mean**2/variance = 1 + 2*k:
d1 = d-loc
m = d1.mean()
s = d1.std()
shape = ((m/s)**2 - 1)/2
scale = m*((m/s)**2+1)/2
return shape, loc, scale
def hessian_nnlf(self, theta, x, eps=None):
try:
loc = theta[-2]
@@ -6876,6 +6869,10 @@ Skellam distribution
def main():
import matplotlib
matplotlib.interactive(True)
R = norm.rvs(size=100)
phat = norm.fit(R)
phat = genpareto.fit(R[R>0.7],f0=0.1, floc=0.7)
#nbinom(10, 0.75).rvs(3)
t = bernoulli(0.75).rvs(3)
x = np.r_[5, 10]

@@ -10,21 +10,18 @@ from __future__ import division
from wafo.plotbackend import plotbackend
from wafo.misc import ecross, findcross
-from scipy.misc.ppimport import ppimport
+#from scipy.misc.ppimport import ppimport
import numdifftools
from scipy import special
from scipy.linalg import pinv2
from scipy import optimize
-from numpy import alltrue, arange, \
-    ravel, ones, sum, \
-    zeros, log, sqrt, exp
-from numpy import atleast_1d, any, asarray, nan, inf, pi, reshape, repeat, product, ndarray
 import numpy
+import numpy as np
+from numpy import alltrue, arange, ravel, ones, sum, zeros, log, sqrt, exp
+from numpy import (atleast_1d, any, asarray, nan, pi, reshape, repeat,
+                   product, ndarray, isfinite)
from numpy import flatnonzero as nonzero
@@ -33,11 +30,8 @@ __all__ = [
]
floatinfo = np.finfo(float)
#arr = atleast_1d
arr = asarray
all = alltrue
def chi2isf(p, df):
@@ -123,13 +117,13 @@ class rv_frozen(object):
kwds = dict(moments=moments)
return self.dist.stats(*self.par, **kwds)
    def median(self):
-        return self.dist.median(*self.par, **self.kwds)
+        return self.dist.median(*self.par)
    def mean(self):
-        return self.dist.mean(*self.par, **self.kwds)
+        return self.dist.mean(*self.par)
    def var(self):
-        return self.dist.var(*self.par, **self.kwds)
+        return self.dist.var(*self.par)
    def std(self):
-        return self.dist.std(*self.par, **self.kwds)
+        return self.dist.std(*self.par)
def moment(self, n):
par1 = self.par[:self.dist.numargs]
return self.dist.moment(n, *par1)
@@ -139,9 +133,7 @@ class rv_frozen(object):
'''Probability mass function at k of the given RV'''
return self.dist.pmf(k, *self.par)
    def interval(self, alpha):
-        return self.dist.interval(alpha, *self.par, **self.kwds)
+        return self.dist.interval(alpha, *self.par)
# internal class to profile parameters of a given distribution
@@ -261,9 +253,9 @@ class Profile(object):
self.i_free = nonzero(isfree)
self.Lmax = Lmax
-        self.alpha_Lrange = 0.5 * chi2isf(self.alpha, 1) #_WAFODIST.chi2.isf(self.alpha, 1)
+        self.alpha_Lrange = 0.5 * chi2isf(self.alpha, 1)
self.alpha_cross_level = Lmax - self.alpha_Lrange
-        lowLevel = self.alpha_cross_level - self.alpha_Lrange / 7.0
+        #lowLevel = self.alpha_cross_level - self.alpha_Lrange / 7.0
## Check that par are actually at the optimum
phatv = fit_dist.par.copy()
@@ -327,7 +319,7 @@ class Profile(object):
cond = self.data == -numpy.inf
if any(cond):
ind, = cond.nonzero()
-            self.data.put(ind, numpy.finfo(float).min / 2.0)
+            self.data.put(ind, floatinfo.min / 2.0)
ind1 = numpy.where(ind == 0, ind, ind - 1)
cl = self.alpha_cross_level - self.alpha_Lrange / 2.0
t0 = ecross(self.args, self.data, ind1, cl)
@@ -455,7 +447,8 @@ class FitDistribution(rv_frozen):
RV.profile() - Return Profile Log- likelihood or Product Spacing-function.
-    Member variables:
+    Member variables
+    ----------------
data - data used in fitting
alpha - confidence coefficient
method - method used
@@ -476,85 +469,143 @@ class FitDistribution(rv_frozen):
self.dist = dist
numargs = dist.numargs
self.method, self.alpha, self.par_fix, self.search, self.copydata = map(kwds.get, ['method', 'alpha', 'par_fix', 'search', 'copydata'], ['ml', 0.05, None, True, True])
self.data = ravel(data)
if self.copydata:
self.data = self.data.copy()
self.data.sort()
self.method=self.alpha=self.par_fix=self.search=self.copydata=None
m_variables = ['method', 'alpha', 'par_fix', 'search', 'copydata']
m_defaults = ['ml', 0.05, None, True, True]
for (name, val) in zip(m_variables,m_defaults):
setattr(self, name, kwds.get(name,val))
#self.method, self.alpha, self.par_fix, self.search, self.copydata = map(kwds.get, m_variables, m_defaults)
if self.method.lower()[:].startswith('mps'):
self._fitfun = dist.nlogps
else:
self._fitfun = dist.nnlf
allfixed = False
isfinite = numpy.isfinite
somefixed = (self.par_fix != None) and any(isfinite(self.par_fix))
self.data = ravel(data)
if self.copydata:
self.data = self.data.copy()
self.data.sort()
par, fixedn = self._fit(*args, **kwds)
self.par = arr(par)
somefixed = len(fixedn)>0
if somefixed:
fitfun = self._fxfitfun
self.par_fix = tuple(self.par_fix)
allfixed = all(isfinite(self.par_fix))
self.par = atleast_1d(self.par_fix)
self.i_notfixed = nonzero(1 - isfinite(self.par))
self.i_fixed = nonzero(isfinite(self.par))
if len(self.par) != numargs + 2:
raise ValueError, "Wrong number of input arguments."
if len(args) != len(self.i_notfixed):
raise ValueError("Length of args must equal number of non-fixed parameters given in par_fix! (%d) " % len(self.i_notfixed))
x0 = atleast_1d(args)
else:
fitfun = self.fitfun
loc0, scale0 = map(kwds.get, ['loc', 'scale'])
args, loc0, scale0 = dist.fix_loc_scale(args, loc0, scale0)
Narg = len(args)
if Narg != numargs:
if Narg > numargs:
raise ValueError, "Too many input arguments."
else:
args += (1.0,)*(numargs - Narg)
# location and scale are at the end
x0 = args + (loc0, scale0)
x0 = atleast_1d(x0)
numpar = len(x0)
if self.search and not allfixed:
#args=(self.data,),
par = optimize.fmin(fitfun, x0, disp=0)
if not somefixed:
self.par = par
elif (not allfixed) and somefixed:
self.par[self.i_notfixed] = x0
else:
self.par = x0
self.par_fix = [nan,]*len(self.par)
for i in fixedn:
self.par_fix[i] = self.par[i]
self.i_notfixed = nonzero(1 - isfinite(self.par_fix))
self.i_fixed = nonzero(isfinite(self.par_fix))
np = numargs + 2
numpar = numargs + 2
self.par_cov = zeros((numpar, numpar))
self._compute_cov()
# Set confidence interval for parameters
pvar = numpy.diag(self.par_cov)
zcrit = -norm_ppf(self.alpha / 2.0)
self.par_lower = self.par - zcrit * sqrt(pvar)
self.par_upper = self.par + zcrit * sqrt(pvar)
self.par_upper = None
self.par_lower = None
self.par_cov = zeros((np, np))
self.LLmax = -dist.nnlf(self.par, self.data)
self.LPSmax = -dist.nlogps(self.par, self.data)
self.pvalue = self._pvalue(self.par, self.data, unknown_numpar=numpar)
H = numpy.asmatrix(self._hessian_nnlf(self.par, self.data))
def _reduce_func(self, args, kwds):
args = list(args)
Nargs = len(args) - 2
fixedn = []
index = range(Nargs) + [-2, -1]
names = ['f%d' % n for n in range(Nargs)] + ['floc', 'fscale']
x0 = args[:]
for n, key in zip(index, names):
if kwds.has_key(key):
fixedn.append(n)
args[n] = kwds[key]
del x0[n]
fitfun = self._fitfun
if len(fixedn) == 0:
func = fitfun
restore = None
else:
if len(fixedn) == len(index):
raise ValueError, "All parameters fixed. There is nothing to optimize."
def restore(args, theta):
# Replace with theta for all numbers not in fixedn
# This allows the non-fixed values to vary, but
# we still call self.nnlf with all parameters.
i = 0
for n in range(Nargs):
if n not in fixedn:
args[n] = theta[i]
i += 1
return args
def func(theta, x):
newtheta = restore(args[:], theta)
return fitfun(newtheta, x)
return x0, func, restore, args, fixedn
def _fit(self, *args, **kwds):
dist = self.dist
data = self.data
Narg = len(args)
if Narg > dist.numargs:
raise ValueError, "Too many input arguments."
start = [None]*2
if (Narg < dist.numargs) or not (kwds.has_key('loc') and
kwds.has_key('scale')):
start = dist._fitstart(data) # get distribution specific starting locations
args += start[Narg:-2]
loc = kwds.get('loc', start[-2])
scale = kwds.get('scale', start[-1])
args += (loc, scale)
x0, func, restore, args, fixedn = self._reduce_func(args, kwds)
if self.search:
optimizer = kwds.get('optimizer', optimize.fmin)
# convert string to function in scipy.optimize
if not callable(optimizer) and isinstance(optimizer, (str, unicode)):
if not optimizer.startswith('fmin_'):
optimizer = "fmin_"+optimizer
if optimizer == 'fmin_':
optimizer = 'fmin'
try:
optimizer = getattr(optimize, optimizer)
except AttributeError:
raise ValueError, "%s is not a valid optimizer" % optimizer
vals = optimizer(func,x0,args=(ravel(data),),disp=0)
vals = tuple(vals)
else:
vals = tuple(x0)
if restore is not None:
vals = restore(args, vals)
return vals, fixedn
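    # Usage sketch for the fitting machinery above: fixing parameters with
    # the f0/floc/fscale keywords removes them from the optimization, e.g.
    # FitDistribution(genpareto, data, f0=0.1, floc=0.7), or equivalently
    # genpareto.fit(data, f0=0.1, floc=0.7) as in main() of distributions.py,
    # estimates only the scale; method='mps' selects the maximum product of
    # spacings criterion (dist.nlogps) instead of maximum likelihood.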
def _compute_cov(self):
'''Compute covariance
'''
somefixed = (self.par_fix != None) and any(isfinite(self.par_fix))
H = numpy.asmatrix(self.dist.hessian_nnlf(self.par, self.data))
self.H = H
try:
if somefixed:
allfixed = all(isfinite(self.par_fix))
if allfixed:
pass
elif somefixed:
self.par_cov[:,:]=0
else:
pcov = -pinv2(H[self.i_notfixed, :][..., self.i_notfixed])
for row, ix in enumerate(list(self.i_notfixed)):
self.par_cov[ix, self.i_notfixed] = pcov[row, :]
else:
self.par_cov = -pinv2(H)
except:
self.par_cov[:, :] = nan
pvar = numpy.diag(self.par_cov)
zcrit = -norm_ppf(self.alpha / 2.0)#_WAFODIST.norm.ppf(self.alpha / 2.0)
self.par_lower = self.par - zcrit * sqrt(pvar)
self.par_upper = self.par + zcrit * sqrt(pvar)
def fitfun(self, phat):
return self._fitfun(phat, self.data)
@@ -562,7 +613,6 @@ class FitDistribution(rv_frozen):
self.par[self.i_notfixed] = phat10
return self._fitfun(self.par, self.data)
def profile(self, **kwds):
''' Profile Log- likelihood or Log Product Spacing- function,
which can be used for constructing confidence interval for
@@ -812,65 +862,7 @@ class FitDistribution(rv_frozen):
return pvalue
def _hessian_nnlf(self, theta, data, eps=None):
''' approximate hessian of nnlf where theta are the parameters (including loc and scale)
'''
#Nd = len(x)
np = len(theta)
# pab 07.01.2001: Always choose the stepsize h so that
# it is an exactly representable number.
# This is important when calculating numerical derivatives and is
# accomplished by the following.
if eps == None:
eps = (floatinfo.machar.eps) ** 0.4
#xmin = floatinfo.machar.xmin
#myfun = lambda y: max(y,100.0*log(xmin)) #% trick to avoid log of zero
delta = (eps + 2.0) - 2.0
delta2 = delta ** 2.0
# % Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with
# % 1/(d^2 L(theta|x)/dtheta^2)
# % using central differences
dist = self.dist
LL = dist.nnlf(theta, data)
H = zeros((np, np)) #%% Hessian matrix
theta = tuple(theta)
for ix in xrange(np):
sparam = list(theta)
sparam[ix] = theta[ix] + delta
fp = dist.nnlf(sparam, data)
#fp = sum(myfun(x))
sparam[ix] = theta[ix] - delta
fm = dist.nnlf(sparam, data)
#fm = sum(myfun(x))
H[ix, ix] = (fp - 2 * LL + fm) / delta2
for iy in range(ix + 1, np):
sparam[ix] = theta[ix] + delta
sparam[iy] = theta[iy] + delta
fpp = dist.nnlf(sparam, data)
#fpp = sum(myfun(x))
sparam[iy] = theta[iy] - delta
fpm = dist.nnlf(sparam, data)
#fpm = sum(myfun(x))
sparam[ix] = theta[ix] - delta
fmm = dist.nnlf(sparam, data)
#fmm = sum(myfun(x));
sparam[iy] = theta[iy] + delta
fmp = dist.nnlf(sparam, data)
#fmp = sum(myfun(x))
H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2)
H[iy, ix] = H[ix, iy]
sparam[iy] = theta[iy];
# invert the Hessian matrix (i.e. invert the observed information number)
#pcov = -pinv(H);
return - H
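    # The stencil in _hessian_nnlf above is the standard second-order central
    # difference:
    #   d2f/dxi2     ~ (f(x + d*ei) - 2*f(x) + f(x - d*ei)) / d**2
    #   d2f/dxi dxj  ~ (f(+d,+d) + f(-d,-d) - f(+d,-d) - f(-d,+d)) / (4*d**2)
    # with the step d = (eps + 2.0) - 2.0 rounded to an exactly representable
    # number, as the comment at the top of the method explains.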
def main():
_WAFODIST = ppimport('wafo.stats.distributions')

@@ -52,13 +52,13 @@ class WafoData(object):
>>> h = d2.plot()
Plot with confidence interval
-    d3 = wdata(sin(x),x)
-    d3 = set(d3,'dataCI',[sin(x(:))*0.9 sin(x(:))*1.2])
-    plot(d3)
+    >>> d3 = WafoData(np.sin(x),x)
+    >>> d3.children = [WafoData(np.vstack([np.sin(x)*0.9, np.sin(x)*1.2]).T,x)]
+    >>> d3.children_args = [':r']
+    >>> h = d3.plot()
See also
--------
wdata/plot,
specdata,
covdata
'''
@@ -68,6 +68,7 @@ class WafoData(object):
self.date = now()
self.plotter = None
self.children = None
self.children_args = []
self.labels = AxisLabels(**kwds)
self.setplotter()
@@ -76,8 +77,9 @@ class WafoData(object):
if self.children != None:
plotbackend.hold('on')
tmp = []
+            child_args = args + tuple(self.children_args)
            for child in self.children:
-                tmp1 = child.plot(*args, **kwds)
+                tmp1 = child.plot(*child_args, **kwds)
if tmp1 != None:
tmp.append(tmp1)
if len(tmp) == 0:

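# Note on the change above: children are overlaid on the parent's axes, and
# children_args supplies their plot format (e.g. ':r' for a dotted red
# confidence band), as used by reslife and dispersion_idx in stats/core.py.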