|
|
|
from __future__ import division
|
|
|
|
import warnings
|
|
|
|
from wafo.containers import PlotData
|
|
|
|
from wafo.misc import findextrema
|
|
|
|
from scipy import special
|
|
|
|
import numpy as np
|
|
|
|
from numpy import inf
|
|
|
|
from numpy import atleast_1d, nan, ndarray, sqrt, vstack, ones, where, zeros
|
|
|
|
|
|
|
|
from numpy import arange, floor, linspace, asarray
|
|
|
|
from time import gmtime, strftime
|
|
|
|
|
|
|
|
|
|
|
|
__all__ = [
|
|
|
|
'edf', 'edfcnd', 'reslife', 'dispersion_idx', 'decluster', 'findpot',
|
|
|
|
'declustering_time', 'interexceedance_times', 'extremal_idx']
|
|
|
|
|
|
|
|
arr = asarray
|
|
|
|
|
|
|
|
|
|
|
|
def now():
|
|
|
|
"""
|
|
|
|
Return current date and time as a string
|
|
|
|
"""
|
|
|
|
return strftime("%a, %d %b %Y %H:%M:%S", gmtime())
|
|
|
|
|
|
|
|
|
|
|
|
def valarray(shape, value=nan, typecode=None):
|
|
|
|
"""Return an array of all value.
|
|
|
|
"""
|
|
|
|
out = ones(shape, dtype=bool) * value  # bool array upcasts to value's dtype
|
|
|
|
if typecode is not None:
|
|
|
|
out = out.astype(typecode)
|
|
|
|
if not isinstance(out, ndarray):
|
|
|
|
out = arr(out)
|
|
|
|
return out
|
|
|
|
|
|
|
|
|
|
|
|
def _cdff(x, dfn, dfd):
|
|
|
|
return special.fdtr(dfn, dfd, x)
|
|
|
|
|
|
|
|
|
|
|
|
def _cdft(x, df):
|
|
|
|
return special.stdtr(df, x)
|
|
|
|
|
|
|
|
|
|
|
|
def _invt(q, df):
|
|
|
|
return special.stdtrit(df, q)
|
|
|
|
|
|
|
|
|
|
|
|
def _cdfchi2(x, df):
|
|
|
|
return special.chdtr(df, x)
|
|
|
|
|
|
|
|
|
|
|
|
def _invchi2(q, df):
|
|
|
|
return special.chdtri(df, q)
|
|
|
|
|
|
|
|
|
|
|
|
def _cdfnorm(x):
|
|
|
|
return special.ndtr(x)
|
|
|
|
|
|
|
|
|
|
|
|
def _invnorm(q):
|
|
|
|
return special.ndtri(q)
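
# A quick sanity sketch (illustrative, not part of the original API): most of
# the wrappers above pair a CDF with its inverse, but _invchi2 wraps
# scipy.special.chdtri, which inverts the *complemented* chi-square CDF, i.e.
# _invchi2(q, df) returns x with 1 - _cdfchi2(x, df) == q.
def _special_wrappers_roundtrip(x=1.5, df=4):
    ok_norm = np.allclose(_invnorm(_cdfnorm(x)), x)
    ok_t = np.allclose(_invt(_cdft(x, df), df), x)
    ok_chi2 = np.allclose(_invchi2(1 - _cdfchi2(x, df), df), x)
    return ok_norm and ok_t and ok_chi2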
|
|
|
|
|
|
|
|
|
|
|
|
def edf(x, method=2):
|
|
|
|
"""
|
|
|
|
Returns Empirical Distribution Function (EDF).
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
x : array-like
|
|
|
|
data vector
|
|
|
|
method : integer scalar
|
|
|
|
1. Interpolation so that F(X_(k)) == (k-0.5)/n.
|
|
|
|
2. Interpolation so that F(X_(k)) == k/(n+1). (default)
|
|
|
|
3. The empirical distribution. F(X_(k)) = k/n
|
|
|
|
|
|
|
|
Example
|
|
|
|
-------
|
|
|
|
>>> import wafo.stats as ws
|
|
|
|
>>> x = np.linspace(0,6,200)
|
|
|
|
>>> R = ws.rayleigh.rvs(scale=2,size=100)
|
|
|
|
>>> F = ws.edf(R)
|
|
|
|
>>> h = F.plot()
|
|
|
|
|
|
|
|
See also edfcnd, pdfplot, cumtrapz
|
|
|
|
"""
|
|
|
|
z = atleast_1d(x)
|
|
|
|
z.sort()
|
|
|
|
|
|
|
|
n = len(z)
|
|
|
|
if method == 1:
|
|
|
|
e_cdf = arange(0.5, n) / n
|
|
|
|
elif method == 3:
|
|
|
|
e_cdf = arange(1, n + 1) / n
|
|
|
|
else:
|
|
|
|
e_cdf = arange(1, n + 1) / (n + 1)
|
|
|
|
|
|
|
|
return PlotData(e_cdf, z, xlab='x', ylab='F(x)', plotmethod='step')
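
# A minimal sketch (illustrative only) contrasting the three plotting
# positions documented above on a tiny sample; it uses nothing beyond the
# module's own imports.
def _edf_plotting_positions_demo():
    x = np.array([0.2, 0.7, 1.5, 2.1])
    n = len(x)
    positions = {1: arange(0.5, n) / n,          # (k - 0.5)/n
                 2: arange(1, n + 1) / (n + 1),  # k/(n + 1)  (default)
                 3: arange(1, n + 1) / n}        # k/n
    for method in (1, 2, 3):
        print(method, np.sort(x), positions[method])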
|
|
|
|
|
|
|
|
|
|
|
|
def edfcnd(x, c=None, method=2):
|
|
|
|
"""
|
|
|
|
Returns Empirical Distribution Function CoNDitioned on X>=c (EDFCND).
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
x : array-like
|
|
|
|
data vector
|
|
|
|
method : integer scalar
|
|
|
|
1. Interpolation so that cdf(X_(k)) == (k-0.5)/n.
|
|
|
|
2. Interpolation so that cdf(X_(k)) == k/(n+1). (default)
|
|
|
|
3. The empirical distribution. cdf(X_(k)) = k/n
|
|
|
|
|
|
|
|
Example
|
|
|
|
-------
|
|
|
|
>>> import wafo.stats as ws
|
|
|
|
>>> x = np.linspace(0,6,200)
|
|
|
|
>>> R = ws.rayleigh.rvs(scale=2,size=100)
|
|
|
|
>>> Fc = ws.edfcnd(R, 1)
|
|
|
|
>>> hc = Fc.plot()
|
|
|
|
>>> cdf = ws.edf(R)
|
|
|
|
>>> h = cdf.plot()
|
|
|
|
|
|
|
|
See also edf, pdfplot, cumtrapz
|
|
|
|
"""
|
|
|
|
z = atleast_1d(x)
|
|
|
|
if c is None:
|
|
|
|
c = floor(min(z.min(), 0))
|
|
|
|
|
|
|
|
cdf = edf(z[c <= z], method=method)
|
|
|
|
|
|
|
|
if -inf < c:
|
|
|
|
cdf.labels.ylab = 'F(x| X>=%g)' % c
|
|
|
|
|
|
|
|
return cdf
|
|
|
|
|
|
|
|
|
|
|
|
def _check_nmin(nmin, n):
|
|
|
|
nmin = max(nmin, 1)
|
|
|
|
if 2 * nmin > n:
|
|
|
|
warnings.warn('nmin possibly too large!')
|
|
|
|
return nmin
|
|
|
|
|
|
|
|
|
|
|
|
def _check_umin_umax(data, umin, umax, nmin):
|
|
|
|
sd = np.sort(data)
|
|
|
|
sdmax, sdmin = sd[-nmin], sd[0]
|
|
|
|
umax = sdmax if umax is None else min(umax, sdmax)
|
|
|
|
umin = sdmin if umin is None else max(umin, sdmin)
|
|
|
|
return umin, umax
|
|
|
|
|
|
|
|
|
|
|
|
def _check_nu(nu, nmin, n):
|
|
|
|
if nu is None:
|
|
|
|
nu = min(n - nmin, 100)
|
|
|
|
return nu
|
|
|
|
|
|
|
|
|
|
|
|
def reslife(data, u=None, umin=None, umax=None, nu=None, nmin=3, alpha=0.05,
|
|
|
|
plotflag=False):
|
|
|
|
"""
|
|
|
|
Return Mean Residual Life, i.e., mean excesses vs thresholds
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
data : array_like
|
|
|
|
vector of data of length N.
|
|
|
|
u : array-like
|
|
|
|
threshold values (default linspace(umin, umax, nu))
|
|
|
|
umin, umax : real scalars
|
|
|
|
Minimum and maximum threshold, respectively
|
|
|
|
(default min(data), max(data)).
|
|
|
|
nu : scalar integer
|
|
|
|
number of threshold values (default min(N-nmin,100))
|
|
|
|
nmin : scalar integer
|
|
|
|
Minimum number of extremes to include. (Default 3).
|
|
|
|
alpha : real scalar
|
|
|
|
Confidence coefficient (default 0.05)
|
|
|
|
plotflag: bool
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
mrl : PlotData object
|
|
|
|
Mean residual life values, i.e., mean excesses over thresholds, u.
|
|
|
|
|
|
|
|
Notes
|
|
|
|
-----
|
|
|
|
RESLIFE estimates mean excesses over thresholds. The purpose of the MRL is
|
|
|
|
to determine the threshold where the upper tail of the data can be
|
|
|
|
approximated with the generalized Pareto distribution (GPD). The GPD is
|
|
|
|
appropriate for the tail, if the MRL is a linear function of the
|
|
|
|
threshold, u. Theoretically in the GPD model
|
|
|
|
|
|
|
|
E(X-u0|X>u0) = s0/(1+k)
|
|
|
|
E(X-u |X>u) = s/(1+k) = (s0 -k*u)/(1+k) for u>u0
|
|
|
|
|
|
|
|
where k,s is the shape and scale parameter, respectively.
|
|
|
|
s0 = scale parameter for threshold u0<u.
|
|
|
|
|
|
|
|
Example
|
|
|
|
-------
|
|
|
|
>>> import wafo
|
|
|
|
>>> R = wafo.stats.genpareto.rvs(0.1,2,2,size=100)
|
|
|
|
>>> mrl = reslife(R,nu=20)
|
|
|
|
>>> h = mrl.plot()
|
|
|
|
|
|
|
|
See also
|
|
|
|
---------
|
|
|
|
genpareto
|
|
|
|
fitgenparrange, disprsnidx
|
|
|
|
"""
|
|
|
|
if u is None:
|
|
|
|
n = len(data)
|
|
|
|
nmin = _check_nmin(nmin, n)
|
|
|
|
umin, umax = _check_umin_umax(data, umin, umax, nmin)
|
|
|
|
nu = _check_nu(nu, nmin, n)
|
|
|
|
u = linspace(umin, umax, nu)
|
|
|
|
|
|
|
|
nu = len(u)
|
|
|
|
|
|
|
|
# mrl1 = valarray(nu)
|
|
|
|
# srl = valarray(nu)
|
|
|
|
# num = valarray(nu)
|
|
|
|
|
|
|
|
def mean_and_std(data1):
|
|
|
|
return data1.mean(), data1.std(), data1.size
|
|
|
|
|
|
|
|
dat = arr(data)
|
|
|
|
tmp = arr([mean_and_std(dat[dat > thresh] - thresh) for thresh in u])
|
|
|
|
|
|
|
|
mrl, srl, num = tmp.T
|
|
|
|
p = 1 - alpha
|
|
|
|
alpha2 = alpha / 2
|
|
|
|
|
|
|
|
# Approximate P% confidence interval
|
|
|
|
# z_a = -_invnorm(alpha2)  # known mean
|
|
|
|
z_a = -_invt(alpha2, num - 1) # unknown mean
|
|
|
|
mrlu = mrl + z_a * srl / sqrt(num)
|
|
|
|
mrll = mrl - z_a * srl / sqrt(num)
|
|
|
|
|
|
|
|
title_txt = 'Mean residual life with %d%s CI' % (100 * p, '%')
|
|
|
|
res = PlotData(mrl, u, xlab='Threshold',
|
|
|
|
ylab='Mean Excess', title=title_txt)
|
|
|
|
res.workspace = dict(
|
|
|
|
numdata=num, umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
|
|
|
|
res.children = [
|
|
|
|
PlotData(vstack([mrll, mrlu]).T, u, xlab='Threshold', title=title_txt)]
|
|
|
|
res.plot_args_children = [':r']
|
|
|
|
if plotflag:
|
|
|
|
res.plot()
|
|
|
|
return res
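
# A minimal sketch (assumes the GPD parameterization quoted in the reslife
# docstring) of the theoretical mean-excess line
# E(X - u | X > u) = (s0 - k*u)/(1 + k); if the empirical MRL returned by
# reslife is roughly linear in u, a GPD tail fit above that threshold is
# plausible.
def _gpd_theoretical_mrl(u, k, s0):
    """Theoretical mean residual life for thresholds u, shape k, scale s0."""
    return (s0 - k * np.asarray(u)) / (1 + k)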
|
|
|
|
|
|
|
|
|
|
|
|
def dispersion_idx(
|
|
|
|
data, t=None, u=None, umin=None, umax=None, nu=None, nmin=10, tb=1,
|
|
|
|
alpha=0.05, plotflag=False):
|
|
|
|
"""Return Dispersion Index vs threshold
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
data, ti : array_like
|
|
|
|
data values and sampled times, respectively.
|
|
|
|
u : array-like
|
|
|
|
threshold values (default linspace(umin, umax, nu))
|
|
|
|
umin, umax : real scalars
|
|
|
|
Minimum and maximum threshold, respectively
|
|
|
|
(default min(data), max(data)).
|
|
|
|
nu : scalar integer
|
|
|
|
number of threshold values (default min(N-nmin,100))
|
|
|
|
nmin : scalar integer
|
|
|
|
Minimum number of extremes to include. (Default 10).
|
|
|
|
tb : Real scalar
|
|
|
|
Block period (same unit as the sampled times) (default 1)
|
|
|
|
alpha : real scalar
|
|
|
|
Confidence coefficient (default 0.05)
|
|
|
|
plotflag: bool
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
DI : PlotData object
|
|
|
|
Dispersion index
|
|
|
|
b_u : real scalar
|
|
|
|
threshold where the number of exceedances in a fixed period (Tb) is
|
|
|
|
consistent with a Poisson process.
|
|
|
|
ok_u : array-like
|
|
|
|
all thresholds where the number of exceedances in a fixed period (Tb)
|
|
|
|
is consistent with a Poisson process.
|
|
|
|
|
|
|
|
Notes
|
|
|
|
------
|
|
|
|
DISPRSNIDX estimates the Dispersion Index (DI) as a function of threshold.
|
|
|
|
DI measures the homogeneity of data, and the purpose of DI is to determine
|
|
|
|
the threshold where the number of exceedances in a fixed period (Tb) is
|
|
|
|
consistent with a Poisson process. For a Poisson process the DI is one.
|
|
|
|
Thus the threshold should be so high that DI is not significantly
|
|
|
|
different from 1.
|
|
|
|
|
|
|
|
The Poisson hypothesis is not rejected if the estimated DI is between:
|
|
|
|
|
|
|
|
chi2(alpha/2, M-1)/(M-1) < DI < chi2(1-alpha/2, M-1)/(M-1)
|
|
|
|
|
|
|
|
where M is the total number of fixed periods/blocks, generally
|
|
|
|
the total number of years in the sample.
|
|
|
|
|
|
|
|
Example
|
|
|
|
-------
|
|
|
|
>>> import wafo.data
|
|
|
|
>>> xn = wafo.data.sea()
|
|
|
|
>>> t, data = xn.T
|
|
|
|
>>> Ie = findpot(data,t,0,5);
|
|
|
|
>>> di, u, ok_u = dispersion_idx(data[Ie],t[Ie],tb=100)
|
|
|
|
>>> h = di.plot() # a threshold around 1 seems appropriate.
|
|
|
|
>>> round(u*100)/100
|
|
|
|
1.03
|
|
|
|
|
|
|
|
vline(u)
|
|
|
|
|
|
|
|
See also
|
|
|
|
--------
|
|
|
|
reslife,
|
|
|
|
fitgenparrange,
|
|
|
|
extremal_idx
|
|
|
|
|
|
|
|
References
|
|
|
|
----------
|
|
|
|
Ribatet, M. A.,(2006),
|
|
|
|
A User's Guide to the POT Package (Version 1.0)
|
|
|
|
month = {August},
|
|
|
|
url = {http://cran.r-project.org/}
|
|
|
|
|
|
|
|
Cunnane, C. (1979) Note on the Poisson assumption in
partial duration series model. Water Resources Research, 15(2),
489-494.
|
|
|
|
"""
|
|
|
|
def _find_appropriate_threshold(u, di, di_low, di_up):
|
|
|
|
k1, = np.where((di_low < di) & (di < di_up))
|
|
|
|
if len(k1) > 0:
|
|
|
|
ok_u = u[k1]
|
|
|
|
b_di = di[k1].mean() < di[k1]
|
|
|
|
k = b_di.argmax()
|
|
|
|
b_u = ok_u[k]
|
|
|
|
else:
|
|
|
|
b_u = ok_u = None
|
|
|
|
return b_u, ok_u
|
|
|
|
|
|
|
|
n = len(data)
|
|
|
|
if t is None:
|
|
|
|
ti = arange(n)
|
|
|
|
else:
|
|
|
|
ti = arr(t) - min(t)
|
|
|
|
|
|
|
|
t1 = np.empty(ti.shape, dtype=int)
|
|
|
|
t1[:] = np.floor(ti / tb)
|
|
|
|
|
|
|
|
if u is None:
|
|
|
|
nmin = _check_nmin(nmin, n)
|
|
|
|
umin, umax = _check_umin_umax(data, umin, umax, nmin)
|
|
|
|
nu = _check_nu(nu, nmin, n)
|
|
|
|
u = linspace(umin, umax, nu)
|
|
|
|
|
|
|
|
nu = len(u)
|
|
|
|
|
|
|
|
di = np.zeros(nu)
|
|
|
|
|
|
|
|
d = arr(data)
|
|
|
|
|
|
|
|
mint = int(min(t1)) # should be 0.
|
|
|
|
maxt = int(max(t1))
|
|
|
|
M = maxt - mint + 1
|
|
|
|
occ = np.zeros(M)
|
|
|
|
|
|
|
|
for ix, thresh in enumerate(u):
|
|
|
|
excess = (d > thresh)
|
|
|
|
lambda_ = excess.sum() / M
|
|
|
|
for block in range(M):
|
|
|
|
occ[block] = sum(excess[t1 == block])
|
|
|
|
|
|
|
|
di[ix] = occ.var() / lambda_
|
|
|
|
|
|
|
|
p = 1.0 - alpha
|
|
|
|
|
|
|
|
di_low = _invchi2(1 - alpha / 2, M - 1) / (M - 1)
|
|
|
|
di_up = _invchi2(alpha / 2, M - 1) / (M - 1)
|
|
|
|
|
|
|
|
b_u, ok_u = _find_appropriate_threshold(u, di, di_low, di_up)
|
|
|
|
|
|
|
|
ci_txt = '{0:d}{1} CI'.format(int(100 * p), '%')
|
|
|
|
title_txt = 'Dispersion Index plot'
|
|
|
|
|
|
|
|
res = PlotData(di, u, title=title_txt,
|
|
|
|
xlab='Threshold', ylab='Dispersion Index')
|
|
|
|
|
|
|
|
res.workspace = dict(umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
|
|
|
|
res.children = [
|
|
|
|
PlotData(vstack([di_low * ones(nu), di_up * ones(nu)]).T, u,
|
|
|
|
xlab='Threshold', title=ci_txt)]
|
|
|
|
res.plot_args_children = ['--r']
|
|
|
|
if plotflag:
|
|
|
|
res.plot()
|
|
|
|
return res, b_u, ok_u
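
# A minimal helper sketch (illustrative) of the Poisson acceptance band used
# above: with M blocks, DI is consistent with a Poisson process when it lies
# between chi2(alpha/2, M-1)/(M-1) and chi2(1-alpha/2, M-1)/(M-1). It reuses
# the module's _invchi2 wrapper, whose first argument is an upper-tail
# probability.
def _di_poisson_band(M, alpha=0.05):
    di_low = _invchi2(1 - alpha / 2, M - 1) / (M - 1)
    di_up = _invchi2(alpha / 2, M - 1) / (M - 1)
    return di_low, di_up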
|
|
|
|
|
|
|
|
|
|
|
|
def decluster(data, t=None, thresh=None, tmin=1):
|
|
|
|
"""
|
|
|
|
Return declustered peaks over threshold values
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
data, t : array-like
|
|
|
|
data-values and sampling-times, respectively.
|
|
|
|
thresh : real scalar
|
|
|
|
minimum threshold for levels in data.
|
|
|
|
tmin : real scalar
|
|
|
|
minimum distance to another peak [same unit as t] (default 1)
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
ev, te : ndarray
|
|
|
|
extreme values and their corresponding sampling times, respectively,
|
|
|
|
i.e., all data > thresh which are at least tmin distance apart.
|
|
|
|
|
|
|
|
Example
|
|
|
|
-------
|
|
|
|
>>> import pylab
|
|
|
|
>>> import wafo.data
|
|
|
|
>>> from wafo.misc import findtc
|
|
|
|
>>> x = wafo.data.sea()
|
|
|
|
>>> t, data = x[:400,:].T
|
|
|
|
>>> itc, iv = findtc(data,0,'dw')
|
|
|
|
>>> ytc, ttc = data[itc], t[itc]
|
|
|
|
>>> ymin = 2*data.std()
|
|
|
|
>>> tmin = 10 # sec
|
|
|
|
>>> [ye, te] = decluster(ytc,ttc, ymin,tmin);
|
|
|
|
>>> h = pylab.plot(t,data,ttc,ytc,'ro',t,zeros(len(t)),':',te,ye,'k.')
|
|
|
|
|
|
|
|
See also
|
|
|
|
--------
|
|
|
|
fitgenpar, findpot, extremalidx
|
|
|
|
"""
|
|
|
|
if t is None:
|
|
|
|
t = np.arange(len(data))
|
|
|
|
i = findpot(data, t, thresh, tmin)
|
|
|
|
return data[i], t[i]
|
|
|
|
|
|
|
|
|
|
|
|
def _remove_index_to_data_too_close_to_each_other(ix_e, is_too_small, di_e, ti_e, tmin):
|
|
|
|
is_too_close = np.hstack((is_too_small[0], is_too_small[:-1] | is_too_small[1:],
|
|
|
|
is_too_small[-1]))
|
|
|
|
# Find opening (no) and closing (nc) indices for data being too close:
|
|
|
|
iy = findextrema(np.hstack([0, 0, is_too_small, 0]))
|
|
|
|
no = iy[::2] - 1  # opening index for each run (pairs with nc below)
|
|
|
|
nc = iy[1::2]
|
|
|
|
for start, stop in zip(no, nc):
|
|
|
|
iz = slice(start, stop)
|
|
|
|
i_ok = _find_ok_peaks(di_e[iz], ti_e[iz], tmin)
|
|
|
|
if len(i_ok):
|
|
|
|
is_too_close[start + i_ok] = 0
|
|
|
|
|
|
|
|
# Remove data which is too close to other data.
|
|
|
|
if is_too_close.any():
|
|
|
|
i_ok, = where(1 - is_too_close)
|
|
|
|
ix_e = ix_e[i_ok]
|
|
|
|
return ix_e
|
|
|
|
|
|
|
|
|
|
|
|
def findpot(data, t=None, thresh=None, tmin=1):
|
|
|
|
"""
|
|
|
|
Return indices to peaks over threshold values
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
data, t : array-like
|
|
|
|
data-values and sampling-times, respectively.
|
|
|
|
thresh : real scalar
|
|
|
|
minimum threshold for levels in data.
|
|
|
|
tmin : real scalar
|
|
|
|
minimum distance to another peak [same unit as t] (default 1)
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
ix_e : ndarray
|
|
|
|
indices to extreme values, i.e., all data > thresh which are at least
|
|
|
|
tmin distance apart.
|
|
|
|
|
|
|
|
Example
|
|
|
|
-------
|
|
|
|
>>> import pylab
|
|
|
|
>>> import wafo.data
|
|
|
|
>>> from wafo.misc import findtc
|
|
|
|
>>> x = wafo.data.sea()
|
|
|
|
>>> t, data = x.T
|
|
|
|
>>> itc, iv = findtc(data,0,'dw')
|
|
|
|
>>> ytc, ttc = data[itc], t[itc]
|
|
|
|
>>> ymin = 2*data.std()
|
|
|
|
>>> tmin = 10 # sec
|
|
|
|
>>> i = findpot(data, t, ymin, tmin)
|
|
|
|
>>> yp, tp = data[i], t[i]
|
|
|
|
>>> ix_e = findpot(yp, tp, ymin,tmin)
|
|
|
|
>>> ye, te = yp[ix_e], tp[ix_e]
|
|
|
|
>>> h = pylab.plot(t,data,ttc,ytc,'ro',
|
|
|
|
... t,zeros(len(t)),':',
|
|
|
|
... te, ye,'k.',tp,yp,'+')
|
|
|
|
|
|
|
|
See also
|
|
|
|
--------
|
|
|
|
fitgenpar, decluster, extremalidx
|
|
|
|
"""
|
|
|
|
data = arr(data)
|
|
|
|
if t is None:
|
|
|
|
t = np.arange(len(data))
|
|
|
|
else:
|
|
|
|
t = arr(t)
|
|
|
|
|
|
|
|
ix_e, = where(data > thresh)
|
|
|
|
di_e = data[ix_e]
|
|
|
|
ti_e = t[ix_e]
|
|
|
|
if len(di_e) <= 1:
|
|
|
|
return ix_e
|
|
|
|
|
|
|
|
dt = np.diff(ti_e)
|
|
|
|
not_sorted = np.any(dt < 0)
|
|
|
|
if not_sorted:
|
|
|
|
i = np.argsort(ti_e)
|
|
|
|
ti_e = ti_e[i]
|
|
|
|
ix_e = ix_e[i]
|
|
|
|
di_e = di_e[i]
|
|
|
|
dt = np.diff(ti_e)
|
|
|
|
|
|
|
|
is_too_small = (dt <= tmin)
|
|
|
|
|
|
|
|
if np.any(is_too_small):
|
|
|
|
ix_e = _remove_index_to_data_too_close_to_each_other(ix_e, is_too_small, di_e, ti_e, tmin)
|
|
|
|
|
|
|
|
return ix_e
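
# A tiny synthetic usage sketch (illustrative data only): exceedances of
# thresh=1.0 at t = 10, 30 and 50 are all more than tmin=5 apart, so findpot
# keeps every one of them.
def _findpot_toy_demo():
    data = np.array([0.5, 2.0, 0.2, 1.5, 0.1, 3.0])
    t = np.array([0., 10., 20., 30., 40., 50.])
    i = findpot(data, t, thresh=1.0, tmin=5.0)
    return data[i], t[i]  # -> (array([2., 1.5, 3.]), array([10., 30., 50.]))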
|
|
|
|
|
|
|
|
|
|
|
|
def _find_ok_peaks(y, t, t_min):
|
|
|
|
"""
|
|
|
|
Return indices to the largest maxima that are at least t_min distance apart.
|
|
|
|
"""
|
|
|
|
num_y = len(y)
|
|
|
|
|
|
|
|
i = np.argsort(-y) # sort in descending order
|
|
|
|
|
|
|
|
tis = t[i]
|
|
|
|
o_order = zeros(num_y, dtype=int)
|
|
|
|
o_order[i] = range(num_y) # indices to the variables original location
|
|
|
|
|
|
|
|
is_too_close = zeros(num_y, dtype=bool)
|
|
|
|
|
|
|
|
pool = zeros((num_y, 2))
|
|
|
|
t_range = np.hstack([-t_min, t_min])
|
|
|
|
k = 0
|
|
|
|
for i, ti in enumerate(tis):
|
|
|
|
is_too_close[i] = np.any((pool[:k, 0] <= ti) & (ti <= pool[:k, 1]))
|
|
|
|
if not is_too_close[i]:
|
|
|
|
pool[k] = ti + t_range
|
|
|
|
k += 1
|
|
|
|
|
|
|
|
i_ok, = where(1 - is_too_close[o_order])
|
|
|
|
return i_ok
|
|
|
|
|
|
|
|
|
|
|
|
def declustering_time(t):
|
|
|
|
"""
|
|
|
|
Returns minimum distance between clusters.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
t : array-like
|
|
|
|
sampling times for data.
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
tc : real scalar
|
|
|
|
minimum distance between clusters.
|
|
|
|
|
|
|
|
Example
|
|
|
|
-------
|
|
|
|
>>> import wafo.data
|
|
|
|
>>> x = wafo.data.sea()
|
|
|
|
>>> t, data = x[:400,:].T
|
|
|
|
>>> Ie = findpot(data,t,0,5)
|
|
|
|
>>> tc = declustering_time(Ie)
|
|
|
|
>>> tc
|
|
|
|
21
|
|
|
|
"""
|
|
|
|
t0 = arr(t)
|
|
|
|
nt = len(t0)
|
|
|
|
if nt < 2:
|
|
|
|
return arr([])
|
|
|
|
ti = interexceedance_times(t0)
|
|
|
|
ei = extremal_idx(ti)
|
|
|
|
if ei == 1:
|
|
|
|
tc = ti.min()
|
|
|
|
else:
|
|
|
|
i = int(np.floor(nt * ei))
|
|
|
|
sti = -np.sort(-ti)
|
|
|
|
tc = sti[min(i, nt - 2)]  # declustering time
|
|
|
|
return tc
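
# A small usage sketch (illustrative): the rule above treats roughly nt*ei of
# the largest interexceedance times as genuine between-cluster gaps, and picks
# the declustering time from the sorted interexceedance times accordingly.
def _declustering_time_demo():
    t = np.array([1, 2, 3, 10, 11, 30])
    ti = interexceedance_times(t)  # array([ 1,  1,  7,  1, 19])
    return extremal_idx(ti), declustering_time(t)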
|
|
|
|
|
|
|
|
|
|
|
|
def interexceedance_times(t):
|
|
|
|
"""
|
|
|
|
Returns interexceedance times of data
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
t : array-like
|
|
|
|
sampling times for data.
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
ti : ndarray
|
|
|
|
interexceedance times
|
|
|
|
|
|
|
|
Example
|
|
|
|
-------
|
|
|
|
>>> t = [1,2,5,10]
|
|
|
|
>>> interexceedance_times(t)
|
|
|
|
array([1, 3, 5])
|
|
|
|
|
|
|
|
"""
|
|
|
|
return np.diff(np.sort(t))
|
|
|
|
|
|
|
|
|
|
|
|
def extremal_idx(ti):
|
|
|
|
"""
|
|
|
|
Returns Extremal Index measuring the dependence of data
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
ti : array-like
|
|
|
|
interexceedance times for data.
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
ei : real scalar
|
|
|
|
Extremal index.
|
|
|
|
|
|
|
|
Notes
|
|
|
|
-----
|
|
|
|
The Extremal Index (EI) is one if the data are independent and less than
|
|
|
|
one if there is some dependence. The extremal index can also be interpreted
|
|
|
|
as the reciprocal of the mean cluster size.
|
|
|
|
|
|
|
|
Example
|
|
|
|
-------
|
|
|
|
>>> import wafo.data
|
|
|
|
>>> x = wafo.data.sea()
|
|
|
|
>>> t, data = x[:400,:].T
|
|
|
|
>>> Ie = findpot(data,t,0,5);
|
|
|
|
>>> ti = interexceedance_times(Ie)
|
|
|
|
>>> ei = extremal_idx(ti)
|
|
|
|
>>> ei
|
|
|
|
1
|
|
|
|
|
|
|
|
See also
|
|
|
|
--------
|
|
|
|
reslife, fitgenparrange, disprsnidx, findpot, decluster
|
|
|
|
|
|
|
|
|
|
|
|
Reference
|
|
|
|
---------
|
|
|
|
Christopher A. T. Ferro, Johan Segers (2003)
|
|
|
|
Inference for clusters of extreme values
|
|
|
|
Journal of the Royal Statistical Society: Series B
(Statistical Methodology) 65(2), 545-556
|
|
|
|
doi:10.1111/1467-9868.00401
|
|
|
|
"""
|
|
|
|
t = arr(ti)
|
|
|
|
tmax = t.max()
|
|
|
|
if tmax <= 1:
|
|
|
|
ei = 0
|
|
|
|
elif tmax <= 2:
|
|
|
|
ei = min(1, 2 * t.mean() ** 2 / ((t ** 2).mean()))
|
|
|
|
else:
|
|
|
|
ei = min(1, 2 * np.mean(t - 1) ** 2 / np.mean((t - 1) * (t - 2)))
|
|
|
|
return ei
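
# A minimal follow-up sketch: since the extremal index is (approximately) the
# reciprocal of the mean cluster size, an estimate of the mean number of
# exceedances per cluster follows directly from extremal_idx.
def _mean_cluster_size(ti):
    ei = extremal_idx(ti)
    return np.inf if ei == 0 else 1.0 / ei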
|
|
|
|
|
|
|
|
|
|
|
|
def _logit(p):
|
|
|
|
return np.log(p) - np.log1p(-p)
|
|
|
|
|
|
|
|
|
|
|
|
def _logitinv(x):
|
|
|
|
return 1.0 / (np.exp(-x) + 1)
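
# Quick sanity check (illustrative only): _logit and _logitinv are inverses on
# (0, 1), so _logitinv(_logit(p)) recovers p up to floating-point error.
def _logit_roundtrip_check(p=0.3):
    return np.allclose(_logitinv(_logit(np.asarray(p))), p)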
|
|
|
|
|
|
|
|
|
|
|
|
class RegLogit(object):
|
|
|
|
|
|
|
|
"""
|
|
|
|
REGLOGIT Fit ordinal logistic regression model.
|
|
|
|
|
|
|
|
CALL model = reglogit (options)
|
|
|
|
|
|
|
|
model = fitted model object with methods
|
|
|
|
.compare() : Compare small LOGIT object versus large one
|
|
|
|
.predict() : Predict from a fitted LOGIT object
|
|
|
|
.summary() : Display summary of fitted LOGIT object.
|
|
|
|
|
|
|
|
y = vector of K ordered categories
|
|
|
|
x = column vectors of covariates
|
|
|
|
options = struct defining performance of REGLOGIT
|
|
|
|
.maxiter : maximum number of iterations.
|
|
|
|
.accuracy : accuracy in convergence.
|
|
|
|
.betastart : Start value for BETA (default 0)
|
|
|
|
.thetastart : Start value for THETA (default depends on Y)
|
|
|
|
.alpha : Confidence coefficient (default 0.05)
|
|
|
|
.verbose : 1 display summary info about fitted model
|
|
|
|
2 display convergence info in each iteration
|
|
|
|
otherwise no action
|
|
|
|
.deletecolinear : If true, delete collinear covariates (default)
|
|
|
|
|
|
|
|
Methods
|
|
|
|
.predict : Predict from a fitted LOGIT object
|
|
|
|
.summary : Display summary of fitted LOGIT object.
|
|
|
|
.compare : Compare small LOGIT versus large one
|
|
|
|
|
|
|
|
Suppose Y takes values in K ordered categories, and let
|
|
|
|
gamma_i (x) be the cumulative probability that Y
|
|
|
|
falls in one of the first i categories given the covariate
|
|
|
|
X. The ordinal logistic regression model is
|
|
|
|
|
|
|
|
logit (gamma_i (x)) = theta_i + beta' * x,   i = 1, ..., K-1
|
|
|
|
|
|
|
|
The number of ordinal categories, K, is taken to be the number
|
|
|
|
of distinct values of round (Y). If K equals 2,
|
|
|
|
Y is binary and the model is ordinary logistic regression. The
|
|
|
|
matrix X is assumed to have full column rank.
|
|
|
|
|
|
|
|
Given Y only, theta = REGLOGIT(Y) fits the model with baseline logit odds
|
|
|
|
only.
|
|
|
|
|
|
|
|
Example
|
|
|
|
y=[1 1 2 1 3 2 3 2 3 3]'
|
|
|
|
x = (1:10)'
|
|
|
|
b = reglogit(y,x)
|
|
|
|
b.display() % members and methods
|
|
|
|
b.get() % return members
|
|
|
|
b.summary()
|
|
|
|
[mu,plo,pup] = b.predict();
|
|
|
|
plot(x,mu,'g',x,plo,'r:',x,pup,'r:')
|
|
|
|
|
|
|
|
y2 = [zeros(5,1);ones(5,1)];
|
|
|
|
x1 = [29,30,31,31,32,29,30,31,32,33];
|
|
|
|
x2 = [62,83,74,88,68,41,44,21,50,33];
|
|
|
|
X = [x1;x2].';
|
|
|
|
b2 = reglogit(y2,X);
|
|
|
|
b2.summary();
|
|
|
|
b21 = reglogit(y2,X(:,1));
|
|
|
|
b21.compare(b2)
|
|
|
|
|
|
|
|
See also regglm, reglm, regnonlm
|
|
|
|
"""
|
|
|
|
|
|
|
|
# Original for MATLAB written by Gordon K Smyth <gks@maths.uq.oz.au>,
|
|
|
|
# U of Queensland, Australia, on Nov 19, 1990. Last revision Aug 3,
|
|
|
|
# 1992.
|
|
|
|
#
|
|
|
|
# Author: Gordon K Smyth <gks@maths.uq.oz.au>,
|
|
|
|
# Revised by: pab
|
|
|
|
# -renamed from oridinal to reglogit
|
|
|
|
# -added predict, summary and compare
|
|
|
|
# Description: Ordinal logistic regression
|
|
|
|
#
|
|
|
|
# Uses the auxiliary functions logistic_regression_derivatives and
|
|
|
|
# logistic_regression_likelihood.
|
|
|
|
|
|
|
|
def __init__(self, maxiter=500, accuracy=1e-6, alpha=0.05,
|
|
|
|
deletecolinear=True, verbose=False):
|
|
|
|
|
|
|
|
self.maxiter = maxiter
|
|
|
|
self.accuracy = accuracy
|
|
|
|
self.alpha = alpha
|
|
|
|
self.deletecolinear = deletecolinear
|
|
|
|
self.verbose = verbose
|
|
|
|
self.family = None
|
|
|
|
self.link = None
|
|
|
|
self.numvar = None
|
|
|
|
self.numobs = None
|
|
|
|
self.numk = None
|
|
|
|
self.df = None
|
|
|
|
self.df_null = None
|
|
|
|
self.params = None
|
|
|
|
self.params_ci = None
|
|
|
|
self.params_cov = None
|
|
|
|
self.params_std = None
|
|
|
|
self.params_corr = None
|
|
|
|
self.params_tstat = None
|
|
|
|
self.params_pvalue = None
|
|
|
|
self.mu = None
|
|
|
|
self.eta = None
|
|
|
|
self.X = None
|
|
|
|
self.Y = None
|
|
|
|
self.theta = None
|
|
|
|
self.beta = None
|
|
|
|
self.residual = None
|
|
|
|
self.residual1d = None
|
|
|
|
self.deviance = None
|
|
|
|
self.deviance_null = None
|
|
|
|
self.d2L = None
|
|
|
|
self.dL = None
|
|
|
|
self.dispersionfit = None
|
|
|
|
self.dispersion = 1
|
|
|
|
self.R2 = None
|
|
|
|
self.R2adj = None
|
|
|
|
self.numiter = None
|
|
|
|
self.converged = None
|
|
|
|
self.note = ''
|
|
|
|
self.date = now()
|
|
|
|
|
|
|
|
def check_xy(self, y, X):
|
|
|
|
y = np.round(np.atleast_2d(y))
|
|
|
|
my = y.shape[0]
|
|
|
|
if X is None:
|
|
|
|
X = np.zeros((my, 0))
|
|
|
|
elif self.deletecolinear:
|
|
|
|
X = np.atleast_2d(X)
|
|
|
|
# Make sure X is full rank
|
|
|
|
s = np.linalg.svd(X)[1]
|
|
|
|
tol = max(X.shape) * np.finfo(s.dtype).eps * s.max()  # rank tolerance
|
|
|
|
ix = np.flatnonzero(s > tol)
|
|
|
|
iy = np.flatnonzero(s <= tol)
|
|
|
|
if len(ix):
|
|
|
|
X = X[:, ix]
|
|
|
|
txt = ''.join(' %d,' % i for i in iy).rstrip(',')
|
|
|
|
warnings.warn(
|
|
|
|
'Covariate matrix is singular. Removing column(s):%s' %
|
|
|
|
txt)
|
|
|
|
mx = X.shape[0]
|
|
|
|
if (mx != my):
|
|
|
|
raise ValueError(
|
|
|
|
'x and y must have the same number of observations')
|
|
|
|
return y, X
|
|
|
|
|
|
|
|
def fit(self, y, X=None, theta0=None, beta0=None):
|
|
|
|
"""
|
|
|
|
Member variables
|
|
|
|
.df : degrees of freedom for error.
|
|
|
|
.params : estimated model parameters
|
|
|
|
.params_ci : 100(1-alpha)% confidence interval for model parameters
|
|
|
|
.params_tstat : t statistics for model's estimated parameters.
|
|
|
|
.params_pvalue: p value for model's estimated parameters.
|
|
|
|
.params_std : standard errors for estimated parameters
|
|
|
|
.params_corr : correlation matrix for estimated parameters.
|
|
|
|
.mu : fitted values for the model.
|
|
|
|
.eta : linear predictor for the model.
|
|
|
|
.residual : residual for the model (Y-E(Y|X)).
|
|
|
|
.dispersionfit : The estimated error variance
|
|
|
|
.deviance : deviance for the model equal minus twice the
|
|
|
|
log-likelihood.
|
|
|
|
.d2L : Hessian matrix (double derivative of log-likelihood)
|
|
|
|
.dL : First derivative of loglikelihood w.r.t. THETA and BETA.
|
|
|
|
|
|
|
|
"""
|
|
|
|
self.family = 'multinomial'
|
|
|
|
self.link = 'logit'
|
|
|
|
y, X = self.check_xy(y, X)
|
|
|
|
|
|
|
|
# initial calculations
|
|
|
|
tol = self.accuracy
|
|
|
|
incr = 10
|
|
|
|
decr = 2
|
|
|
|
ymin = y.min()
|
|
|
|
ymax = y.max()
|
|
|
|
yrange = int(ymax - ymin)  # number of ordered-category boundaries
|
|
|
|
z = (y * ones((1, yrange))) == ((y * 0 + 1) * np.arange(ymin, ymax))
|
|
|
|
z1 = (y * ones((1, yrange))) == (
|
|
|
|
(y * 0 + 1) * np.arange(ymin + 1, ymax + 1))
|
|
|
|
z = z[:, np.flatnonzero(z.any(axis=0))]
|
|
|
|
z1 = z1[:, np.flatnonzero(z1.any(axis=0))]
|
|
|
|
_mz, nz = z.shape
|
|
|
|
_mx, nx = X.shape
|
|
|
|
my, _ny = y.shape
|
|
|
|
|
|
|
|
g = (z.sum(axis=0).cumsum() / my).reshape(-1, 1)
|
|
|
|
theta00 = np.log(g / (1 - g)).ravel()
|
|
|
|
beta00 = np.zeros((nx,))
|
|
|
|
# starting values
|
|
|
|
if theta0 is None:
|
|
|
|
theta0 = theta00
|
|
|
|
|
|
|
|
if beta0 is None:
|
|
|
|
beta0 = beta00
|
|
|
|
|
|
|
|
tb = np.hstack((theta0, beta0))
|
|
|
|
|
|
|
|
# likelihood and derivatives at starting values
|
|
|
|
[dev, dl, d2l] = self.loglike(tb, y, X, z, z1)
|
|
|
|
|
|
|
|
epsilon = np.std(d2l) / 1000
|
|
|
|
if np.any(beta0) or np.any(theta00 != theta0):
|
|
|
|
tb0 = np.hstack((theta00, beta00))
|
|
|
|
nulldev = self.loglike(tb0, y, X, z, z1)[0]
|
|
|
|
else:
|
|
|
|
nulldev = dev
|
|
|
|
|
|
|
|
# maximize likelihood using Levenberg modified Newton's method
|
|
|
|
for i in range(self.maxiter + 1):
|
|
|
|
|
|
|
|
tbold = tb
|
|
|
|
devold = dev
|
|
|
|
tb = tbold - np.linalg.lstsq(d2l, dl)[0]
|
|
|
|
[dev, dl, d2l] = self.loglike(tb, y, X, z, z1)
|
|
|
|
if ((dev - devold) / np.dot(dl, tb - tbold) < 0):
|
|
|
|
epsilon = epsilon / decr
|
|
|
|
else:
|
|
|
|
while ((dev - devold) / np.dot(dl, tb - tbold) > 0):
|
|
|
|
epsilon = epsilon * incr
|
|
|
|
if (epsilon > 1e+15):
|
|
|
|
raise ValueError('epsilon too large')
|
|
|
|
|
|
|
|
tb = tbold - np.linalg.lstsq(d2l - epsilon * np.eye(d2l.shape[0]), dl)[0]
|
|
|
|
[dev, dl, d2l] = self.loglike(tb, y, X, z, z1)
|
|
|
|
print('epsilon %g' % epsilon)
|
|
|
|
# end %while
|
|
|
|
# end else
|
|
|
|
#[dl, d2l] = logistic_regression_derivatives (X, z, z1, g, g1, p);
|
|
|
|
if (self.verbose > 1):
|
|
|
|
|
|
|
|
print('Iter: %d, Deviance: %8.6f' % (i, dev))
|
|
|
|
print('First derivative')
|
|
|
|
print(dl)
|
|
|
|
print('Eigenvalues of second derivative')
|
|
|
|
print(np.linalg.eig(d2l)[0].T)
|
|
|
|
# end
|
|
|
|
# end
|
|
|
|
stop = np.abs(
|
|
|
|
np.dot(dl, np.linalg.lstsq(d2l, dl)[0]) / len(dl)) <= tol
|
|
|
|
if stop:
|
|
|
|
break
|
|
|
|
# end %while
|
|
|
|
|
|
|
|
# tidy up output
|
|
|
|
|
|
|
|
theta = tb[:nz]
|
|
|
|
beta = tb[nz:(nz + nx)]
|
|
|
|
pcov = np.linalg.pinv(-d2l)
|
|
|
|
se = sqrt(np.diag(pcov))
|
|
|
|
|
|
|
|
if (nx > 0):
|
|
|
|
eta = np.dot(X, beta).reshape(-1, 1) + ((y * 0 + 1) * theta)
|
|
|
|
else:
|
|
|
|
eta = (y * 0 + 1) * theta
|
|
|
|
# end
|
|
|
|
gammai = np.diff(
|
|
|
|
np.hstack(((y * 0), _logitinv(eta), (y * 0 + 1))), n=1, axis=1)
|
|
|
|
k0 = min(y)
|
|
|
|
mu = (k0 - 1) + np.dot(gammai, np.arange(1, nz + 2)).reshape(-1, 1)
|
|
|
|
r = np.corrcoef(np.hstack((y, mu)).T)
|
|
|
|
R2 = r[0, 1] ** 2
|
|
|
|
# coefficient of determination
|
|
|
|
# adjusted coefficient of determination
|
|
|
|
R2adj = max(1 - (1 - R2) * (my - 1) / (my - nx - nz - 1), 0)
|
|
|
|
|
|
|
|
res = y - mu
|
|
|
|
|
|
|
|
if nz == 1:
|
|
|
|
self.family = 'binomial'
|
|
|
|
else:
|
|
|
|
self.family = 'multinomial'
|
|
|
|
|
|
|
|
self.link = 'logit'
|
|
|
|
|
|
|
|
self.numvar = nx + nz
|
|
|
|
self.numobs = my
|
|
|
|
self.numk = nz + 1
|
|
|
|
self.df = max(my - nx - nz, 0)
|
|
|
|
self.df_null = my - nz
|
|
|
|
# nulldf; nulldf = n - nz
|
|
|
|
self.params = tb[:(nz + nx)]
|
|
|
|
self.params_ci = 1
|
|
|
|
self.params_std = se
|
|
|
|
self.params_cov = pcov
|
|
|
|
self.params_tstat = (self.params / self.params_std)
|
|
|
|
# options.estdispersn dispersion_parameter=='mean_deviance'
|
|
|
|
if False:
|
|
|
|
self.params_pvalue = 2. * _cdft(-abs(self.params_tstat), self.df)
|
|
|
|
bcrit = -se * _invt(self.alpha / 2, self.df)
|
|
|
|
else:
|
|
|
|
self.params_pvalue = 2. * _cdfnorm(-abs(self.params_tstat))
|
|
|
|
bcrit = -se * _invnorm(self.alpha / 2)
|
|
|
|
# end
|
|
|
|
self.params_ci = np.vstack((self.params + bcrit, self.params - bcrit))
|
|
|
|
|
|
|
|
self.mu = gammai
|
|
|
|
self.eta = _logit(gammai)
|
|
|
|
self.X = X
|
|
|
|
[dev, dl, d2l, p] = self.loglike(tb, y, X, z, z1, numout=4)
|
|
|
|
self.theta = theta
|
|
|
|
self.beta = beta
|
|
|
|
self.gamma = gammai
|
|
|
|
self.residual = res.T
|
|
|
|
self.residualD = np.sign(self.residual) * sqrt(-2 * np.log(p))
|
|
|
|
self.deviance = dev
|
|
|
|
self.deviance_null = nulldev
|
|
|
|
self.d2L = d2l
|
|
|
|
self.dL = dl.T
|
|
|
|
self.dispersionfit = 1
|
|
|
|
self.dispersion = 1
|
|
|
|
self.R2 = R2
|
|
|
|
self.R2adj = R2adj
|
|
|
|
self.numiter = i
|
|
|
|
self.converged = i < self.maxiter
|
|
|
|
self.note = ''
|
|
|
|
self.date = now()
|
|
|
|
|
|
|
|
if (self.verbose):
|
|
|
|
self.summary()
|
|
|
|
|
|
|
|
def compare(self, object2):
|
|
|
|
""" Compare small LOGIT versus large one
|
|
|
|
|
|
|
|
CALL [pvalue] = compare(object2)
|
|
|
|
|
|
|
|
The standard hypothesis test of a larger linear regression
|
|
|
|
model against a smaller one. The standard Chi2-test is used.
|
|
|
|
The output is the p-value of the test.
|
|
|
|
|
|
|
|
See also fitls
|
|
|
|
"""
|
|
|
|
|
|
|
|
try:
|
|
|
|
if self.numvar > object2.numvar:
|
|
|
|
devL = self.deviance
|
|
|
|
nL = self.numvar
|
|
|
|
dfL = self.df
|
|
|
|
Al = self.X
|
|
|
|
disprsn = self.dispersionfit
|
|
|
|
devs = object2.deviance
|
|
|
|
ns = object2.numvar
|
|
|
|
dfs = object2.df
|
|
|
|
As = object2.X
|
|
|
|
else:
|
|
|
|
devL = object2.deviance
|
|
|
|
nL = object2.numvar
|
|
|
|
dfL = object2.df
|
|
|
|
Al = object2.X
|
|
|
|
disprsn = object2.dispersionfit
|
|
|
|
devs = self.deviance
|
|
|
|
ns = self.numvar
|
|
|
|
dfs = self.df
|
|
|
|
As = self.X
|
|
|
|
# end
|
|
|
|
|
|
|
|
if ((np.abs(As - np.dot(Al, np.linalg.lstsq(Al, As)[0])) > 500 * np.finfo(float).eps).any() or
object2.family != self.family or object2.link != self.link):
|
|
|
|
warnings.warn('Small model not included in large model,' +
|
|
|
|
' result is rubbish!')
|
|
|
|
|
|
|
|
except Exception:
|
|
|
|
raise ValueError('Apparently not a valid regression object')
|
|
|
|
|
|
|
|
pmq = np.abs(nL - ns)
|
|
|
|
print(' ')
|
|
|
|
print(' Analysis of Deviance')
|
|
|
|
if False: # options.estdispersn
|
|
|
|
localstat = abs(devL - devs) / disprsn / pmq
|
|
|
|
# localpvalue = 1-cdff(localstat, pmq, dfL)
|
|
|
|
# print('Model DF Residual deviance F-stat Pr(>F)')
|
|
|
|
else:
|
|
|
|
localstat = abs(devL - devs) / disprsn
|
|
|
|
localpvalue = 1 - _cdfchi2(localstat, pmq)
|
|
|
|
print('Model DF Residual deviance Chi2-stat ' +
|
|
|
|
' Pr(>Chi2)')
|
|
|
|
# end
|
|
|
|
|
|
|
|
print('Small %d %12.4f %12.4f %12.4f' %
|
|
|
|
(dfs, devs, localstat, localpvalue))
|
|
|
|
print('Full %d %12.4f' % (dfL, devL))
|
|
|
|
print(' ')
|
|
|
|
|
|
|
|
return localpvalue
|
|
|
|
|
|
|
|
def anode(self):
|
|
|
|
print(' ')
|
|
|
|
print(' Analysis of Deviance')
|
|
|
|
if False: # %options.estdispersn
|
|
|
|
localstat = abs(self.deviance_null - self.deviance) / \
|
|
|
|
self.dispersionfit / (self.numvar - 1)
|
|
|
|
localpvalue = 1 - _cdff(localstat, self.numvar - 1, self.df)
|
|
|
|
print(
|
|
|
|
'Model DF Residual deviance F-stat Pr(>F)')
|
|
|
|
else:
|
|
|
|
localstat = abs(
|
|
|
|
self.deviance_null - self.deviance) / self.dispersionfit
|
|
|
|
localpvalue = 1 - _cdfchi2(localstat, self.numvar - 1)
|
|
|
|
print('Model DF Residual deviance Chi2-stat' +
|
|
|
|
' Pr(>Chi2)')
|
|
|
|
# end
|
|
|
|
|
|
|
|
print('Null %d %12.4f %12.4f %12.4f' %
|
|
|
|
(self.df_null, self.deviance_null, localstat, localpvalue))
|
|
|
|
print('Full %d %12.4f' % (self.df, self.deviance))
|
|
|
|
print(' ')
|
|
|
|
|
|
|
|
print(' R2 = %2.4f, R2adj = %2.4f' % (self.R2, self.R2adj))
|
|
|
|
print(' ')
|
|
|
|
return localpvalue
|
|
|
|
|
|
|
|
def summary(self):
|
|
|
|
txtlink = self.link
|
|
|
|
|
|
|
|
print('Call:')
|
|
|
|
print('reglogit(formula = %s(Pr(grp(y)<=i)) ~ theta_i+beta*x, family = %s)' %
|
|
|
|
(txtlink, self.family))
|
|
|
|
print(' ')
|
|
|
|
print('Deviance Residuals:')
|
|
|
|
m, q1, me, q3, M = np.percentile(
|
|
|
|
self.residualD, q=[0, 25, 50, 75, 100])
|
|
|
|
print(' Min 1Q Median 3Q Max ')
|
|
|
|
print('%2.4f %2.4f %2.4f %2.4f %2.4f' %
|
|
|
|
(m, q1, me, q3, M))
|
|
|
|
print(' ')
|
|
|
|
print(' Coefficients:')
|
|
|
|
if False: # %options.estdispersn
|
|
|
|
print(
|
|
|
|
' Estimate Std. Error t value Pr(>|t|)')
|
|
|
|
else:
|
|
|
|
print(
|
|
|
|
' Estimate Std. Error z value Pr(>|z|)')
|
|
|
|
# end
|
|
|
|
e, s, z, p = (self.params, self.params_std, self.params_tstat,
|
|
|
|
self.params_pvalue)
|
|
|
|
for i in range(self.numk):
|
|
|
|
print(
|
|
|
|
'theta_%d %2.4f %2.4f %2.4f %2.4f' %
|
|
|
|
(i, e[i], s[i], z[i], p[i]))
|
|
|
|
|
|
|
|
for i in range(self.numk, self.numvar):
|
|
|
|
print(
|
|
|
|
' beta_%d %2.4f %2.4f %2.4f %2.4f\n' %
|
|
|
|
(i - self.numk, e[i], s[i], z[i], p[i]))
|
|
|
|
|
|
|
|
print(' ')
|
|
|
|
print('(Dispersion parameter for %s family taken to be %2.2f)' %
|
|
|
|
(self.family, self.dispersionfit))
|
|
|
|
print(' ')
|
|
|
|
if True: # %options.constant
|
|
|
|
print(' Null deviance: %2.4f on %d degrees of freedom' %
|
|
|
|
(self.deviance_null, self.df_null))
|
|
|
|
# end
|
|
|
|
print('Residual deviance: %2.4f on %d degrees of freedom' %
|
|
|
|
(self.deviance, self.df))
|
|
|
|
|
|
|
|
self.anode()
|
|
|
|
|
|
|
|
#end % summary
|
|
|
|
|
|
|
|
def predict(self, Xnew=None, alpha=0.05, fulloutput=False):
|
|
|
|
"""LOGIT/PREDICT Predict from a fitted LOGIT object
|
|
|
|
|
|
|
|
CALL [y,ylo,yup] = predict(Xnew,options)
|
|
|
|
|
|
|
|
y = predicted value
|
|
|
|
ylo,yup = 100(1-alpha)% confidence interval for y
|
|
|
|
|
|
|
|
Xnew = new covariate
|
|
|
|
options = options struct defining the calculation
|
|
|
|
.alpha : confidence coefficient (default 0.05)
|
|
|
|
.size : size if binomial family (default 1).
|
|
|
|
"""
|
|
|
|
|
|
|
|
[_mx, nx] = self.X.shape
|
|
|
|
if Xnew is None:
|
|
|
|
Xnew = self.X
|
|
|
|
else:
|
|
|
|
Xnew = np.atleast_2d(Xnew)
|
|
|
|
notnans = np.flatnonzero(np.isfinite(Xnew).all(axis=1))
|
|
|
|
Xnew = Xnew[notnans, :]
|
|
|
|
|
|
|
|
[n, p] = Xnew.shape
|
|
|
|
|
|
|
|
if p != nx:
|
|
|
|
raise ValueError('Number of covariates must match the number' +
|
|
|
|
' of regression coefficients')
|
|
|
|
|
|
|
|
nz = self.numk - 1
|
|
|
|
one = ones((n, 1))
|
|
|
|
if (nx > 0):
|
|
|
|
eta = np.dot(Xnew, self.beta).reshape(-1, 1) + self.theta
|
|
|
|
else:
|
|
|
|
eta = one * self.theta
|
|
|
|
# end
|
|
|
|
y = np.diff(
|
|
|
|
np.hstack((zeros((n, 1)), _logitinv(eta), one)), n=1, axis=1)
|
|
|
|
if fulloutput:
|
|
|
|
eps = np.finfo(float).eps
|
|
|
|
pcov = self.params_cov
|
|
|
|
if (nx > 0):
|
|
|
|
np1 = pcov.shape[0]
|
|
|
|
|
|
|
|
[U, S, V] = np.linalg.svd(pcov, 0)
|
|
|
|
# square root of pcov
|
|
|
|
R = np.dot(U, np.dot(np.diag(sqrt(S)), V))
|
|
|
|
ib = np.r_[0, nz:np1]
|
|
|
|
|
|
|
|
# Var(eta_i) = var(theta_i+Xnew*b)
|
|
|
|
vareta = zeros((n, nz))
|
|
|
|
u = np.hstack((one, Xnew))
|
|
|
|
for i in range(nz):
|
|
|
|
ib[0] = i
|
|
|
|
vareta[:, i] = np.maximum(
|
|
|
|
((np.dot(u, R[ib][:, ib])) ** 2).sum(axis=1), eps)
|
|
|
|
# end
|
|
|
|
else:
|
|
|
|
vareta = np.diag(pcov)
|
|
|
|
# end
|
|
|
|
crit = -_invnorm(alpha / 2)
|
|
|
|
|
|
|
|
ecrit = crit * sqrt(vareta)
|
|
|
|
mulo = _logitinv(eta - ecrit)
|
|
|
|
muup = _logitinv(eta + ecrit)
|
|
|
|
ylo1 = np.diff(np.hstack((zeros((n, 1)), mulo, one)), n=1, axis=1)
|
|
|
|
yup1 = np.diff(np.hstack((zeros((n, 1)), muup, one)), n=1, axis=1)
|
|
|
|
|
|
|
|
ylo = np.minimum(ylo1, yup1)
|
|
|
|
yup = np.maximum(ylo1, yup1)
|
|
|
|
|
|
|
|
for i in range(1, nz): # = 2:self.numk-1
|
|
|
|
yup[:, i] = np.vstack(
|
|
|
|
(yup[:, i], muup[:, i] - mulo[:, i - 1])).max(axis=0)
|
|
|
|
# end
|
|
|
|
return y, ylo, yup
|
|
|
|
return y
|
|
|
|
|
|
|
|
def loglike(self, beta, y, x, z, z1, numout=3):
|
|
|
|
"""
|
|
|
|
[dev, dl, d2l, p] = loglike( y ,x,beta,z,z1)
|
|
|
|
Calculates likelihood for the ordinal logistic regression model.
|
|
|
|
"""
|
|
|
|
# Author: Gordon K. Smyth <gks@maths.uq.oz.au>
|
|
|
|
zx = np.hstack((z, x))
|
|
|
|
z1x = np.hstack((z1, x))
|
|
|
|
g = _logitinv(np.dot(zx, beta)).reshape((-1, 1))
|
|
|
|
g1 = _logitinv(np.dot(z1x, beta)).reshape((-1, 1))
|
|
|
|
g = np.maximum(y == y.max(), g)
|
|
|
|
g1 = np.minimum(y > y.min(), g1)
|
|
|
|
|
|
|
|
p = g - g1
|
|
|
|
dev = -2 * np.log(p).sum()
|
|
|
|
|
|
|
|
"""[dl, d2l] = derivatives of loglike(beta, y, x, z, z1)
|
|
|
|
% Called by logistic_regression. Calculates derivates of the
|
|
|
|
% log-likelihood for ordinal logistic regression model.
|
|
|
|
"""
|
|
|
|
# Author: Gordon K. Smyth <gks@maths.uq.oz.au>
|
|
|
|
# Description: Derivates of log-likelihood in logistic regression
|
|
|
|
|
|
|
|
# first derivative
|
|
|
|
v = g * (1 - g) / p
|
|
|
|
v1 = g1 * (1 - g1) / p
|
|
|
|
dlogp = np.hstack((((v * z) - (v1 * z1)), ((v - v1) * x)))
|
|
|
|
dl = np.sum(dlogp, axis=0)
|
|
|
|
|
|
|
|
# second derivative
|
|
|
|
w = v * (1 - 2 * g)
|
|
|
|
w1 = v1 * (1 - 2 * g1)
|
|
|
|
d2l = np.dot(zx.T, (w * zx)) - np.dot(
|
|
|
|
z1x.T, (w1 * z1x)) - np.dot(dlogp.T, dlogp)
|
|
|
|
|
|
|
|
if numout == 4:
|
|
|
|
return dev, dl, d2l, p
|
|
|
|
else:
|
|
|
|
return dev, dl, d2l
|
|
|
|
#end %function
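
# A minimal standalone sketch (illustrative; mirrors the model stated in the
# RegLogit docstring) of how category probabilities follow from the cumulative
# logits: P(Y = i | x) is the difference of consecutive inverse-logit values,
# with the conventions gamma_0 = 0 and gamma_K = 1.
def _ordinal_probs(theta, beta, x):
    eta = np.asarray(theta) + np.dot(np.asarray(x), np.asarray(beta))
    gam = np.concatenate(([0.0], _logitinv(eta), [1.0]))
    return np.diff(gam)  # probabilities of the K ordered categories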
|
|
|
|
|
|
|
|
|
|
|
|
def _test_dispersion_idx():
|
|
|
|
import wafo.data
|
|
|
|
xn = wafo.data.sea()
|
|
|
|
t, data = xn.T
|
|
|
|
Ie = findpot(data, t, 0, 5)
|
|
|
|
di, _u, _ok_u = dispersion_idx(data[Ie], t[Ie], tb=100)
|
|
|
|
di.plot() # a threshold around 1 seems appropriate.
|
|
|
|
di.show()
|
|
|
|
|
|
|
|
|
|
|
|
def _test_findpot():
|
|
|
|
import pylab
|
|
|
|
import wafo.data
|
|
|
|
from wafo.misc import findtc
|
|
|
|
x = wafo.data.sea()
|
|
|
|
t, data = x[:, :].T
|
|
|
|
itc, _iv = findtc(data, 0, 'dw')
|
|
|
|
ytc, ttc = data[itc], t[itc]
|
|
|
|
ymin = 2 * data.std()
|
|
|
|
tmin = 10 # sec
|
|
|
|
I = findpot(data, t, ymin, tmin)
|
|
|
|
yp, tp = data[I], t[I]
|
|
|
|
Ie = findpot(yp, tp, ymin, tmin)
|
|
|
|
ye, te = yp[Ie], tp[Ie]
|
|
|
|
pylab.plot(t, data, ttc, ytc, 'ro', t,
|
|
|
|
zeros(len(t)), ':', te, ye, 'kx', tp, yp, '+')
|
|
|
|
pylab.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _test_reslife():
|
|
|
|
import wafo
|
|
|
|
R = wafo.stats.genpareto.rvs(0.1, 2, 2, size=100)
|
|
|
|
mrl = reslife(R, nu=20)
|
|
|
|
mrl.plot()
|
|
|
|
|
|
|
|
|
|
|
|
def test_reglogit():
|
|
|
|
y = np.array([1, 1, 2, 1, 3, 2, 3, 2, 3, 3]).reshape(-1, 1)
|
|
|
|
x = np.arange(1, 11).reshape(-1, 1)
|
|
|
|
b = RegLogit()
|
|
|
|
b.fit(y, x)
|
|
|
|
# b.display() # members and methods
|
|
|
|
|
|
|
|
b.summary()
|
|
|
|
[mu, plo, pup] = b.predict(fulloutput=True) # @UnusedVariable
|
|
|
|
|
|
|
|
# plot(x,mu,'g',x,plo,'r:',x,pup,'r:')
|
|
|
|
|
|
|
|
|
|
|
|
def test_reglogit2():
|
|
|
|
n = 40
|
|
|
|
x = np.sort(5 * np.random.rand(n, 1) - 2.5, axis=0)
|
|
|
|
y = (np.cos(x) > 2 * np.random.rand(n, 1) - 1)
|
|
|
|
b = RegLogit()
|
|
|
|
b.fit(y, x)
|
|
|
|
# b.display() # members and methods
|
|
|
|
b.summary()
|
|
|
|
[mu, plo, pup] = b.predict(fulloutput=True)
|
|
|
|
import matplotlib.pyplot as pl
|
|
|
|
pl.plot(x, mu, 'g', x, plo, 'r:', x, pup, 'r:')
|
|
|
|
pl.show()
|
|
|
|
|
|
|
|
|
|
|
|
def test_sklearn0():
|
|
|
|
from sklearn.linear_model import LogisticRegression
|
|
|
|
from sklearn import datasets # @UnusedImport
|
|
|
|
|
|
|
|
# FIXME: the iris dataset has only 4 features!
|
|
|
|
# iris = datasets.load_iris()
|
|
|
|
# X = iris.data
|
|
|
|
# y = iris.target
|
|
|
|
|
|
|
|
X = np.sort(5 * np.random.rand(40, 1) - 2.5, axis=0)
|
|
|
|
y = (2 * (np.cos(X) > 2 * np.random.rand(40, 1) - 1) - 1).ravel()
|
|
|
|
|
|
|
|
score = []
|
|
|
|
# Set regularization parameter
|
|
|
|
cvals = np.logspace(-1, 1, 5)
|
|
|
|
for C in cvals:
|
|
|
|
clf_LR = LogisticRegression(C=C, penalty='l2')
|
|
|
|
clf_LR.fit(X, y)
|
|
|
|
score.append(clf_LR.score(X, y))
|
|
|
|
|
|
|
|
#plot(cvals, score)
|
|
|
|
|
|
|
|
|
|
|
|
def test_sklearn():
|
|
|
|
X = np.sort(5 * np.random.rand(40, 1) - 2.5, axis=0)
|
|
|
|
y = (2 * (np.cos(X) > 2 * np.random.rand(40, 1) - 1) - 1).ravel()
|
|
|
|
from sklearn.svm import SVR
|
|
|
|
|
|
|
|
#
|
|
|
|
# look at the results
|
|
|
|
import pylab as pl
|
|
|
|
pl.scatter(X, .5 * np.cos(X) + 0.5, c='k', label='True model')
|
|
|
|
|
|
|
|
cvals = np.logspace(-1, 3, 20)
|
|
|
|
score = []
|
|
|
|
for c in cvals:
|
|
|
|
svr_rbf = SVR(kernel='rbf', C=c, gamma=0.1)  # SVR takes no 'probability' argument
|
|
|
|
svrf = svr_rbf.fit(X, y)
|
|
|
|
y_rbf = svrf.predict(X)
|
|
|
|
score.append(svrf.score(X, y))
|
|
|
|
pl.plot(X, y_rbf, label='RBF model c=%g' % c)
|
|
|
|
pl.xlabel('data')
|
|
|
|
pl.ylabel('target')
|
|
|
|
pl.title('Support Vector Regression')
|
|
|
|
pl.legend()
|
|
|
|
pl.show()
|
|
|
|
|
|
|
|
|
|
|
|
def test_sklearn1():
|
|
|
|
X = np.sort(5 * np.random.rand(40, 1) - 2.5, axis=0)
|
|
|
|
y = (2 * (np.cos(X) > 2 * np.random.rand(40, 1) - 1) - 1).ravel()
|
|
|
|
from sklearn.svm import SVR
|
|
|
|
|
|
|
|
# cvals= np.logspace(-1,4,10)
|
|
|
|
# note: sklearn's SVR takes no 'probability' argument
svr_rbf = SVR(kernel='rbf', C=1e4, gamma=0.1)
svr_lin = SVR(kernel='linear', C=1e4)
svr_poly = SVR(kernel='poly', C=1e4, degree=2)
|
|
|
|
y_rbf = svr_rbf.fit(X, y).predict(X)
|
|
|
|
y_lin = svr_lin.fit(X, y).predict(X)
|
|
|
|
y_poly = svr_poly.fit(X, y).predict(X)
|
|
|
|
|
|
|
|
#
|
|
|
|
# look at the results
|
|
|
|
import pylab as pl
|
|
|
|
pl.scatter(X, .5 * np.cos(X) + 0.5, c='k', label='True model')
|
|
|
|
|
|
|
|
pl.plot(X, y_rbf, c='g', label='RBF model')
|
|
|
|
pl.plot(X, y_lin, c='r', label='Linear model')
|
|
|
|
pl.plot(X, y_poly, c='b', label='Polynomial model')
|
|
|
|
pl.xlabel('data')
|
|
|
|
pl.ylabel('target')
|
|
|
|
pl.title('Support Vector Regression')
|
|
|
|
pl.legend()
|
|
|
|
pl.show()
|
|
|
|
|
|
|
|
|
|
|
|
def test_docstrings():
|
|
|
|
#_test_dispersion_idx()
|
|
|
|
import doctest
|
|
|
|
doctest.testmod()
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
# test_reglogit2()
|
|
|
|
test_docstrings()
|