from __future__ import division
import warnings
from wafo.containers import PlotData
from wafo.misc import findextrema
from scipy import special
import numpy as np
from numpy import inf
from numpy import atleast_1d, nan, ndarray, sqrt, vstack, ones, where, zeros
# , reshape, repeat, product
from numpy import arange, floor, linspace, asarray
from time import gmtime, strftime
__all__ = [
'edf', 'edfcnd', 'reslife', 'dispersion_idx', 'decluster', 'findpot',
'declustering_time', 'interexceedance_times', 'extremal_idx']
arr = asarray
def now():
"""
Return current date and time as a string
"""
return strftime("%a, %d %b %Y %H:%M:%S", gmtime())
def valarray(shape, value=nan, typecode=None):
"""Return an array of all value.
"""
out = ones(shape, dtype=bool) * value
if typecode is not None:
out = out.astype(typecode)
if not isinstance(out, ndarray):
out = arr(out)
return out
def _cdff(x, dfn, dfd):
return special.fdtr(dfn, dfd, x)
def _cdft(x, df):
return special.stdtr(df, x)
def _invt(q, df):
return special.stdtrit(df, q)
def _cdfchi2(x, df):
return special.chdtr(df, x)
def _invchi2(q, df):
return special.chdtri(df, q)
def _cdfnorm(x):
return special.ndtr(x)
def _invnorm(q):
return special.ndtri(q)
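# The helpers above are thin wrappers around scipy.special so the rest of
# this module can use short, distribution-style names: _cdff, _cdft, _cdfchi2
# and _cdfnorm are the F, Student t, chi-square and standard normal CDFs,
# while _invt, _invchi2 and _invnorm are the corresponding inverses.  Note
# that special.chdtri takes an upper-tail probability, so _invchi2(q, df)
# returns the (1 - q) quantile of the chi-square distribution.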
def edf(x, method=2):
"""
Returns Empirical Distribution Function (EDF).
Parameters
----------
x : array-like
data vector
method : integer scalar
1. Interpolation so that F(X_(k)) == (k-0.5)/n.
2. Interpolation so that F(X_(k)) == k/(n+1). (default)
3. The empirical distribution. F(X_(k)) = k/n
Example
-------
>>> import wafo.stats as ws
>>> x = np.linspace(0,6,200)
>>> R = ws.rayleigh.rvs(scale=2,size=100)
>>> F = ws.edf(R)
>>> h = F.plot()
    See also edfcnd, pdfplot, cumtrapz
"""
z = atleast_1d(x)
z.sort()
n = len(z)
if method == 1:
e_cdf = arange(0.5, n) / n
elif method == 3:
e_cdf = arange(1, n + 1) / n
else:
e_cdf = arange(1, n + 1) / (n + 1)
return PlotData(e_cdf, z, xlab='x', ylab='F(x)', plotmethod='step')
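# Illustrative sketch (added, not part of the original module): spells out
# the three plotting-position formulas accepted by `edf` for a sample of
# size n.  The helper name `_example_edf_methods` is ours.
def _example_edf_methods(n=4):
    k = np.arange(1, n + 1)
    print('method 1:', (k - 0.5) / n)      # F(X_(k)) == (k-0.5)/n
    print('method 2:', k / (n + 1.0))      # F(X_(k)) == k/(n+1) (default)
    print('method 3:', k / float(n))       # F(X_(k)) == k/n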
def edfcnd(x, c=None, method=2):
"""
    Returns the Empirical Distribution Function conditioned on X >= c
    (EDFCND).
    Parameters
    ----------
    x : array-like
        data vector
    c : real scalar
        value to condition on; only data points with x >= c are used
        (default: the smaller of floor(min(x)) and 0).
    method : integer scalar
        1. Interpolation so that cdf(X_(k)) == (k-0.5)/n.
        2. Interpolation so that cdf(X_(k)) == k/(n+1). (default)
        3. The empirical distribution. cdf(X_(k)) = k/n
Example
-------
>>> import wafo.stats as ws
>>> x = np.linspace(0,6,200)
>>> R = ws.rayleigh.rvs(scale=2,size=100)
>>> Fc = ws.edfcnd(R, 1)
>>> hc = Fc.plot()
>>> cdf = ws.edf(R)
>>> h = cdf.plot()
See also edf, pdfplot, cumtrapz
"""
z = atleast_1d(x)
if c is None:
c = floor(min(z.min(), 0))
cdf = edf(z[c <= z], method=method)
if - inf < c:
cdf.labels.ylab = 'F(x| X>=%g)' % c
return cdf
def _check_nmin(nmin, n):
nmin = max(nmin, 1)
if 2 * nmin > n:
warnings.warn('nmin possibly too large!')
return nmin
def _check_umin_umax(data, umin, umax, nmin):
sd = np.sort(data)
sdmax, sdmin = sd[-nmin], sd[0]
umax = sdmax if umax is None else min(umax, sdmax)
umin = sdmin if umin is None else max(umin, sdmin)
return umin, umax
def _check_nu(nu, nmin, n):
if nu is None:
nu = min(n - nmin, 100)
return nu
def reslife(data, u=None, umin=None, umax=None, nu=None, nmin=3, alpha=0.05,
plotflag=False):
"""
Return Mean Residual Life, i.e., mean excesses vs thresholds
Parameters
    ----------
data : array_like
vector of data of length N.
u : array-like
threshold values (default linspace(umin, umax, nu))
umin, umax : real scalars
Minimum and maximum threshold, respectively
(default min(data), max(data)).
nu : scalar integer
number of threshold values (default min(N-nmin,100))
nmin : scalar integer
Minimum number of extremes to include. (Default 3).
alpha : real scalar
Confidence coefficient (default 0.05)
    plotflag : bool
        if True, plot the mean residual life (default False)
Returns
-------
mrl : PlotData object
Mean residual life values, i.e., mean excesses over thresholds, u.
Notes
-----
    RESLIFE estimates mean excesses over a range of thresholds. The purpose
    of the MRL is to determine the threshold above which the upper tail of
    the data can be approximated by the generalized Pareto distribution
    (GPD). The GPD is appropriate for the tail if the MRL is a linear
    function of the threshold, u. Theoretically, in the GPD model
        E(X-u0 | X>u0) = s0/(1+k)
        E(X-u  | X>u)  = s/(1+k) = (s0 - k*u)/(1+k)   for u > u0
    where k and s are the shape and scale parameters, respectively, and
    s0 is the scale parameter for the threshold u0 < u.
Example
-------
>>> import wafo
>>> R = wafo.stats.genpareto.rvs(0.1,2,2,size=100)
>>> mrl = reslife(R,nu=20)
>>> h = mrl.plot()
See also
---------
genpareto
fitgenparrange, disprsnidx
"""
if u is None:
n = len(data)
nmin = _check_nmin(nmin, n)
umin, umax = _check_umin_umax(data, umin, umax, nmin)
nu = _check_nu(nu, nmin, n)
u = linspace(umin, umax, nu)
nu = len(u)
# mrl1 = valarray(nu)
# srl = valarray(nu)
# num = valarray(nu)
def mean_and_std(data1):
return data1.mean(), data1.std(), data1.size
dat = arr(data)
tmp = arr([mean_and_std(dat[dat > tresh] - tresh) for tresh in u.tolist()])
mrl, srl, num = tmp.T
p = 1 - alpha
alpha2 = alpha / 2
# Approximate P% confidence interval
# z_a = -invnorm(alpha2); % known mean
z_a = -_invt(alpha2, num - 1) # unknown mean
mrlu = mrl + z_a * srl / sqrt(num)
mrll = mrl - z_a * srl / sqrt(num)
title_txt = 'Mean residual life with %d%s CI' % (100 * p, '%')
res = PlotData(mrl, u, xlab='Threshold',
ylab='Mean Excess', title=title_txt)
res.workspace = dict(
numdata=num, umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
res.children = [
PlotData(vstack([mrll, mrlu]).T, u, xlab='Threshold', title=title_txt)]
res.plot_args_children = [':r']
if plotflag:
res.plot()
return res
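# Illustrative sketch (added, not part of the original module): a quick
# numerical check of the mean-excess relation quoted in the `reslife`
# docstring.  scipy.stats.genpareto uses the shape parameter c = -k relative
# to that formula, so with loc=0 the theoretical mean excess at threshold u
# is (s0 + c*u)/(1 - c).  The helper name and parameter values are made up.
def _example_mean_excess_check(c=0.25, s0=2.0, u=1.0, n=200000, seed=0):
    from scipy.stats import genpareto
    x = genpareto.rvs(c, scale=s0, size=n, random_state=seed)
    empirical = (x[x > u] - u).mean()         # sample mean excess over u
    theoretical = (s0 + c * u) / (1.0 - c)    # GPD mean excess
    print('empirical %.3f, theoretical %.3f' % (empirical, theoretical))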
def dispersion_idx(
data, t=None, u=None, umin=None, umax=None, nu=None, nmin=10, tb=1,
alpha=0.05, plotflag=False):
"""Return Dispersion Index vs threshold
Parameters
----------
    data, t : array_like
data values and sampled times, respectively.
u : array-like
threshold values (default linspace(umin, umax, nu))
umin, umax : real scalars
Minimum and maximum threshold, respectively
(default min(data), max(data)).
nu : scalar integer
number of threshold values (default min(N-nmin,100))
nmin : scalar integer
Minimum number of extremes to include. (Default 10).
    tb : real scalar
        Block period (same unit as the sampled times) (default 1)
    alpha : real scalar
        Confidence coefficient (default 0.05)
    plotflag : bool
        if True, plot the dispersion index (default False)
Returns
-------
DI : PlotData object
Dispersion index
b_u : real scalar
threshold where the number of exceedances in a fixed period (Tb) is
consistent with a Poisson process.
ok_u : array-like
all thresholds where the number of exceedances in a fixed period (Tb)
is consistent with a Poisson process.
Notes
------
    DISPRSNIDX estimates the Dispersion Index (DI) as a function of
    threshold. DI measures the homogeneity of the data; its purpose is to
    determine the threshold where the number of exceedances in a fixed
    period (Tb) is consistent with a Poisson process. For a Poisson process
    the DI is one. Thus the threshold should be high enough that the DI is
    not significantly different from 1.
    The Poisson hypothesis is not rejected if the estimated DI satisfies
        chi2(alpha/2, M-1)/(M-1) < DI < chi2(1-alpha/2, M-1)/(M-1)
    where M is the total number of fixed periods/blocks, generally
    the total number of years in the sample.
Example
-------
>>> import wafo.data
>>> xn = wafo.data.sea()
>>> t, data = xn.T
>>> Ie = findpot(data,t,0,5);
>>> di, u, ok_u = dispersion_idx(data[Ie],t[Ie],tb=100)
>>> h = di.plot() # a threshold around 1 seems appropriate.
>>> round(u*100)/100
1.03
vline(u)
See also
--------
reslife,
fitgenparrange,
extremal_idx
    References
    ----------
    Ribatet, M. A. (2006),
    A User's Guide to the POT Package (Version 1.0), August.
    http://cran.r-project.org/
    Cunnane, C. (1979), Note on the Poisson assumption in
    partial duration series model. Water Resources Research, 15(2), 489-494.
"""
def _find_appropriate_threshold(u, di, di_low, di_up):
k1, = np.where((di_low < di) & (di < di_up))
if len(k1) > 0:
ok_u = u[k1]
b_di = di[k1].mean() < di[k1]
k = b_di.argmax()
b_u = ok_u[k]
else:
b_u = ok_u = None
return b_u, ok_u
n = len(data)
if t is None:
ti = arange(n)
else:
ti = arr(t) - min(t)
t1 = np.empty(ti.shape, dtype=int)
t1[:] = np.floor(ti / tb)
if u is None:
nmin = _check_nmin(nmin, n)
umin, umax = _check_umin_umax(data, umin, umax, nmin)
nu = _check_nu(nu, nmin, n)
u = linspace(umin, umax, nu)
nu = len(u)
di = np.zeros(nu)
d = arr(data)
mint = int(min(t1)) # should be 0.
maxt = int(max(t1))
M = maxt - mint + 1
occ = np.zeros(M)
for ix, tresh in enumerate(u):
excess = (d > tresh)
lambda_ = excess.sum() / M
for block in range(M):
occ[block] = sum(excess[t1 == block])
di[ix] = occ.var() / lambda_
p = 1.0 - alpha
di_low = _invchi2(1 - alpha / 2, M - 1) / (M - 1)
di_up = _invchi2(alpha / 2, M - 1) / (M - 1)
b_u, ok_u = _find_appropriate_threshold(u, di, di_low, di_up)
    ci_txt = '{0:d}{1} CI'.format(int(100 * p), '%')
title_txt = 'Dispersion Index plot'
    res = PlotData(di, u, title=title_txt,
                   xlab='Threshold', ylab='Dispersion Index')
res.workspace = dict(umin=umin, umax=umax, nu=nu, nmin=nmin, alpha=alpha)
res.children = [
PlotData(vstack([di_low * ones(nu), di_up * ones(nu)]).T, u,
xlab='Threshold', title=ci_txt)]
res.plot_args_children = ['--r']
if plotflag:
        res.plot()
return res, b_u, ok_u
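# Illustrative sketch (added, not part of the original module): the
# chi-square acceptance band for the dispersion index described in the
# `dispersion_idx` docstring.  The Poisson hypothesis is not rejected for
# thresholds where the estimated DI stays inside this band.  The helper
# name and default arguments are made up.
def _example_di_band(num_blocks=10, alpha=0.05):
    m = num_blocks
    di_low = _invchi2(1 - alpha / 2, m - 1) / (m - 1)   # lower bound
    di_up = _invchi2(alpha / 2, m - 1) / (m - 1)        # upper bound
    print('accept Poisson if %.3f < DI < %.3f' % (di_low, di_up))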
def decluster(data, t=None, thresh=None, tmin=1):
"""
Return declustered peaks over threshold values
Parameters
----------
data, t : array-like
data-values and sampling-times, respectively.
thresh : real scalar
minimum threshold for levels in data.
tmin : real scalar
minimum distance to another peak [same unit as t] (default 1)
Returns
-------
    ev, te : ndarray
        extreme values and their corresponding sampling times, respectively,
        i.e., all data > thresh which are at least tmin distance apart.
Example
-------
>>> import pylab
>>> import wafo.data
>>> from wafo.misc import findtc
>>> x = wafo.data.sea()
>>> t, data = x[:400,:].T
>>> itc, iv = findtc(data,0,'dw')
>>> ytc, ttc = data[itc], t[itc]
>>> ymin = 2*data.std()
>>> tmin = 10 # sec
>>> [ye, te] = decluster(ytc,ttc, ymin,tmin);
>>> h = pylab.plot(t,data,ttc,ytc,'ro',t,zeros(len(t)),':',te,ye,'k.')
See also
--------
fitgenpar, findpot, extremalidx
"""
if t is None:
t = np.arange(len(data))
i = findpot(data, t, thresh, tmin)
return data[i], t[i]
def _remove_index_to_data_too_close_to_each_other(ix_e, is_too_small, di_e, ti_e, tmin):
is_too_close = np.hstack((is_too_small[0], is_too_small[:-1] | is_too_small[1:],
is_too_small[-1]))
    # Find opening (no) and closing (nc) indices for data being too close:
    iy = findextrema(np.hstack([0, 0, is_too_small, 0]))
    no = iy[::2] - 1
nc = iy[1::2]
for start, stop in zip(no, nc):
iz = slice(start, stop)
i_ok = _find_ok_peaks(di_e[iz], ti_e[iz], tmin)
if len(i_ok):
is_too_close[start + i_ok] = 0
# Remove data which is too close to other data.
if is_too_close.any():
i_ok, = where(1 - is_too_close)
ix_e = ix_e[i_ok]
return ix_e
def findpot(data, t=None, thresh=None, tmin=1):
"""
    Return indices to Peaks Over Threshold values
Parameters
----------
data, t : array-like
data-values and sampling-times, respectively.
thresh : real scalar
minimum threshold for levels in data.
tmin : real scalar
minimum distance to another peak [same unit as t] (default 1)
Returns
-------
    ix_e : ndarray
        indices to extreme values, i.e., all data > thresh which are at least
        tmin distance apart.
Example
-------
>>> import pylab
>>> import wafo.data
>>> from wafo.misc import findtc
>>> x = wafo.data.sea()
>>> t, data = x.T
>>> itc, iv = findtc(data,0,'dw')
>>> ytc, ttc = data[itc], t[itc]
>>> ymin = 2*data.std()
>>> tmin = 10 # sec
>>> i = findpot(data, t, ymin, tmin)
>>> yp, tp = data[i], t[i]
>>> ix_e = findpot(yp, tp, ymin,tmin)
>>> ye, te = yp[ix_e], tp[ix_e]
>>> h = pylab.plot(t,data,ttc,ytc,'ro',
... t,zeros(len(t)),':',
... te, ye,'k.',tp,yp,'+')
See also
--------
fitgenpar, decluster, extremalidx
"""
data = arr(data)
if t is None:
t = np.arange(len(data))
else:
t = arr(t)
ix_e, = where(data > thresh)
di_e = data[ix_e]
ti_e = t[ix_e]
if len(di_e) <= 1:
return ix_e
dt = np.diff(ti_e)
not_sorted = np.any(dt < 0)
if not_sorted:
i = np.argsort(ti_e)
ti_e = ti_e[i]
ix_e = ix_e[i]
di_e = di_e[i]
dt = np.diff(ti_e)
is_too_small = (dt <= tmin)
if np.any(is_too_small):
ix_e = _remove_index_to_data_too_close_to_each_other(ix_e, is_too_small, di_e, ti_e, tmin)
return ix_e
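# Illustrative sketch (added, not part of the original module): declustering
# with `findpot` on a small synthetic series.  Exceedances of the threshold
# that lie closer than tmin time units are declustered so that the returned
# peaks are at least tmin apart, keeping the larger values.  The helper name
# and the data are made up.
def _example_findpot_synthetic():
    t = np.arange(30.0)                                        # sampling times
    data = np.zeros_like(t)
    data[[3, 4, 12, 20, 21, 22]] = [5., 7., 6., 4., 8., 3.]    # a few spikes
    ix = findpot(data, t, thresh=2.0, tmin=5.0)
    print(ix, data[ix])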
def _find_ok_peaks(y, t, t_min):
"""
Return indices to the largest maxima that are at least t_min distance apart.
"""
num_y = len(y)
i = np.argsort(-y) # sort in descending order
tis = t[i]
o_order = zeros(num_y, dtype=int)
o_order[i] = range(num_y) # indices to the variables original location
is_too_close = zeros(num_y, dtype=bool)
pool = zeros((num_y, 2))
t_range = np.hstack([-t_min, t_min])
k = 0
for i, ti in enumerate(tis):
is_too_close[i] = np.any((pool[:k, 0] <= ti) & (ti <= pool[:k, 1]))
if not is_too_close[i]:
pool[k] = ti + t_range
k += 1
i_ok, = where(1 - is_too_close[o_order])
return i_ok
def declustering_time(t):
"""
Returns minimum distance between clusters.
Parameters
----------
t : array-like
sampling times for data.
Returns
-------
tc : real scalar
minimum distance between clusters.
Example
-------
>>> import wafo.data
>>> x = wafo.data.sea()
>>> t, data = x[:400,:].T
>>> Ie = findpot(data,t,0,5)
>>> tc = declustering_time(Ie)
>>> tc
21
"""
t0 = arr(t)
nt = len(t0)
if nt < 2:
return arr([])
ti = interexceedance_times(t0)
ei = extremal_idx(ti)
if ei == 1:
tc = ti.min()
else:
i = int(np.floor(nt * ei))
sti = -np.sort(-ti)
tc = sti[min(i, nt - 2)] # % declustering time
return tc
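# Note (added comment): `declustering_time` follows the intervals approach of
# Ferro and Segers (2003): with the extremal index ei estimated from the
# interexceedance times, the declustering time is taken as (roughly) the
# floor(nt*ei)-th largest interexceedance time, so that about nt*ei
# independent clusters remain.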
def interexceedance_times(t):
"""
Returns interexceedance times of data
Parameters
----------
t : array-like
sampling times for data.
Returns
-------
ti : ndarray
interexceedance times
Example
-------
>>> t = [1,2,5,10]
>>> interexceedance_times(t)
array([1, 3, 5])
"""
return np.diff(np.sort(t))
def extremal_idx(ti):
"""
Returns Extremal Index measuring the dependence of data
Parameters
----------
ti : array-like
interexceedance times for data.
Returns
-------
ei : real scalar
Extremal index.
Notes
-----
    The Extremal Index (EI) is one if the data are independent and less than
    one if there is some dependence. The extremal index can also be
    interpreted as the reciprocal of the mean cluster size.
Example
-------
>>> import wafo.data
>>> x = wafo.data.sea()
>>> t, data = x[:400,:].T
>>> Ie = findpot(data,t,0,5);
>>> ti = interexceedance_times(Ie)
>>> ei = extremal_idx(ti)
>>> ei
1
See also
--------
reslife, fitgenparrange, disprsnidx, findpot, decluster
Reference
---------
Christopher A. T. Ferro, Johan Segers (2003)
Inference for clusters of extreme values
    Journal of the Royal Statistical Society: Series B
    (Statistical Methodology) 65(2), 545-556
doi:10.1111/1467-9868.00401
"""
t = arr(ti)
tmax = t.max()
if tmax <= 1:
ei = 0
elif tmax <= 2:
ei = min(1, 2 * t.mean() ** 2 / ((t ** 2).mean()))
else:
ei = min(1, 2 * np.mean(t - 1) ** 2 / np.mean((t - 1) * (t - 2)))
return ei
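# Illustrative sketch (added, not part of the original module): the extremal
# index is close to one for roughly independent exceedance times and drops
# below one when exceedances cluster.  The helper name, the gap
# distributions and the seed are made up.
def _example_extremal_idx(seed=0):
    rng = np.random.RandomState(seed)
    t_indep = np.cumsum(rng.exponential(10.0, size=500))         # Poisson-like
    gaps = rng.choice([1.0, 50.0], size=500, p=[0.8, 0.2])       # clustered
    t_clust = np.cumsum(gaps)
    print(extremal_idx(interexceedance_times(t_indep)),
          extremal_idx(interexceedance_times(t_clust)))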
def _logit(p):
return np.log(p) - np.log1p(-p)
def _logitinv(x):
return 1.0 / (np.exp(-x) + 1)
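# _logit maps a probability p in (0, 1) to log(p/(1-p)) on the real line and
# _logitinv is its inverse (the logistic sigmoid).  RegLogit below uses this
# pair as the link between the cumulative category probabilities and the
# linear predictor theta_i + beta'*x.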
class RegLogit(object):
"""
REGLOGIT Fit ordinal logistic regression model.
CALL model = reglogit (options)
model = fitted model object with methods
.compare() : Compare small LOGIT object versus large one
.predict() : Predict from a fitted LOGIT object
.summary() : Display summary of fitted LOGIT object.
y = vector of K ordered categories
x = column vectors of covariates
options = struct defining performance of REGLOGIT
.maxiter : maximum number of iterations.
.accuracy : accuracy in convergence.
.betastart : Start value for BETA (default 0)
.thetastart : Start value for THETA (default depends on Y)
        .alpha : Confidence coefficient (default 0.05)
.verbose : 1 display summary info about fitted model
2 display convergence info in each iteration
otherwise no action
        .deletecolinear : If true delete collinear covariates (default)
Methods
.predict : Predict from a fitted LOGIT object
.summary : Display summary of fitted LOGIT object.
.compare : Compare small LOGIT versus large one
Suppose Y takes values in K ordered categories, and let
gamma_i (x) be the cumulative probability that Y
falls in one of the first i categories given the covariate
X. The ordinal logistic regression model is
    logit (gamma_i (x)) = theta_i + beta' * x,   i = 1...k-1
The number of ordinal categories, K, is taken to be the number
of distinct values of round (Y). If K equals 2,
Y is binary and the model is ordinary logistic regression. The
matrix X is assumed to have full column rank.
Given Y only, theta = REGLOGIT(Y) fits the model with baseline logit odds
only.
Example
y=[1 1 2 1 3 2 3 2 3 3]'
x = (1:10)'
b = reglogit(y,x)
b.display() % members and methods
b.get() % return members
b.summary()
[mu,plo,pup] = b.predict();
plot(x,mu,'g',x,plo,'r:',x,pup,'r:')
y2 = [zeros(5,1);ones(5,1)];
x1 = [29,30,31,31,32,29,30,31,32,33];
x2 = [62,83,74,88,68,41,44,21,50,33];
X = [x1;x2].';
b2 = reglogit(y2,X);
b2.summary();
b21 = reglogit(y2,X(:,1));
b21.compare(b2)
See also regglm, reglm, regnonlm
"""
# Original for MATLAB written by Gordon K Smyth <gks@maths.uq.oz.au>,
# U of Queensland, Australia, on Nov 19, 1990. Last revision Aug 3,
# 1992.
#
# Author: Gordon K Smyth <gks@maths.uq.oz.au>,
# Revised by: pab
    # -renamed from ordinal to reglogit
# -added predict, summary and compare
# Description: Ordinal logistic regression
#
# Uses the auxiliary functions logistic_regression_derivatives and
# logistic_regression_likelihood.
def __init__(self, maxiter=500, accuracy=1e-6, alpha=0.05,
deletecolinear=True, verbose=False):
self.maxiter = maxiter
self.accuracy = accuracy
self.alpha = alpha
self.deletecolinear = deletecolinear
        self.verbose = verbose
self.family = None
self.link = None
self.numvar = None
self.numobs = None
self.numk = None
self.df = None
self.df_null = None
self.params = None
self.params_ci = None
self.params_cov = None
self.params_std = None
self.params_corr = None
self.params_tstat = None
self.params_pvalue = None
self.mu = None
self.eta = None
self.X = None
self.Y = None
self.theta = None
self.beta = None
self.residual = None
self.residual1d = None
self.deviance = None
self.deviance_null = None
self.d2L = None
self.dL = None
self.dispersionfit = None
self.dispersion = 1
self.R2 = None
self.R2adj = None
self.numiter = None
self.converged = None
self.note = ''
self.date = now()
def check_xy(self, y, X):
y = np.round(np.atleast_2d(y))
my = y.shape[0]
if X is None:
X = np.zeros((my, 0))
elif self.deletecolinear:
X = np.atleast_2d(X)
# Make sure X is full rank
s = np.linalg.svd(X)[1]
tol = max(X.shape) * np.finfo(s.max()).eps
ix = np.flatnonzero(s > tol)
iy = np.flatnonzero(s <= tol)
            if len(iy):
                X = X[:, ix]
                txt = ''.join(' %d,' % i for i in iy)[:-1]
                warnings.warn(
                    'Covariate matrix is singular. Removing column(s):%s' %
                    txt)
mx = X.shape[0]
if (mx != my):
raise ValueError(
'x and y must have the same number of observations')
return y, X
def fit(self, y, X=None, theta0=None, beta0=None):
"""
Member variables
.df : degrees of freedom for error.
.params : estimated model parameters
.params_ci : 100(1-alpha)% confidence interval for model parameters
.params_tstat : t statistics for model's estimated parameters.
.params_pvalue: p value for model's estimated parameters.
.params_std : standard errors for estimated parameters
.params_corr : correlation matrix for estimated parameters.
.mu : fitted values for the model.
.eta : linear predictor for the model.
.residual : residual for the model (Y-E(Y|X)).
        .dispersionfit : The estimated error variance
.deviance : deviance for the model equal minus twice the
log-likelihood.
.d2L : Hessian matrix (double derivative of log-likelihood)
.dL : First derivative of loglikelihood w.r.t. THETA and BETA.
"""
self.family = 'multinomial'
self.link = 'logit'
y, X = self.check_xy(y, X)
# initial calculations
tol = self.accuracy
incr = 10
decr = 2
ymin = y.min()
ymax = y.max()
yrange = ymax - ymin
z = (y * ones((1, yrange))) == ((y * 0 + 1) * np.arange(ymin, ymax))
z1 = (y * ones((1, yrange))) == (
(y * 0 + 1) * np.arange(ymin + 1, ymax + 1))
z = z[:, np.flatnonzero(z.any(axis=0))]
z1 = z1[:, np.flatnonzero(z1.any(axis=0))]
_mz, nz = z.shape
_mx, nx = X.shape
my, _ny = y.shape
g = (z.sum(axis=0).cumsum() / my).reshape(-1, 1)
theta00 = np.log(g / (1 - g)).ravel()
beta00 = np.zeros((nx,))
# starting values
if theta0 is None:
theta0 = theta00
if beta0 is None:
beta0 = beta00
tb = np.hstack((theta0, beta0))
# likelihood and derivatives at starting values
[dev, dl, d2l] = self.loglike(tb, y, X, z, z1)
epsilon = np.std(d2l) / 1000
if np.any(beta0) or np.any(theta00 != theta0):
            tb0 = np.hstack((theta00, beta00))
nulldev = self.loglike(tb0, y, X, z, z1)[0]
else:
nulldev = dev
# maximize likelihood using Levenberg modified Newton's method
for i in range(self.maxiter + 1):
tbold = tb
devold = dev
tb = tbold - np.linalg.lstsq(d2l, dl)[0]
[dev, dl, d2l] = self.loglike(tb, y, X, z, z1)
if ((dev - devold) / np.dot(dl, tb - tbold) < 0):
epsilon = epsilon / decr
else:
while ((dev - devold) / np.dot(dl, tb - tbold) > 0):
epsilon = epsilon * incr
if (epsilon > 1e+15):
raise ValueError('epsilon too large')
                    tb = tbold - np.linalg.lstsq(
                        d2l - epsilon * np.eye(d2l.shape[0]), dl)[0]
[dev, dl, d2l] = self.loglike(tb, y, X, z, z1)
print('epsilon %g' % epsilon)
# end %while
# end else
#[dl, d2l] = logistic_regression_derivatives (X, z, z1, g, g1, p);
if (self.verbose > 1):
                print('Iter: %d,  Deviance: %8.6f' % (i, dev))
print('First derivative')
print(dl)
print('Eigenvalues of second derivative')
print(np.linalg.eig(d2l)[0].T)
# end
# end
stop = np.abs(
np.dot(dl, np.linalg.lstsq(d2l, dl)[0]) / len(dl)) <= tol
if stop:
break
# end %while
# tidy up output
theta = tb[:nz, ]
beta = tb[nz:(nz + nx)]
pcov = np.linalg.pinv(-d2l)
se = sqrt(np.diag(pcov))
if (nx > 0):
eta = ((X * beta) * ones((1, nz))) + ((y * 0 + 1) * theta)
else:
eta = (y * 0 + 1) * theta
# end
gammai = np.diff(
np.hstack(((y * 0), _logitinv(eta), (y * 0 + 1))), n=1, axis=1)
k0 = min(y)
mu = (k0 - 1) + np.dot(gammai, np.arange(1, nz + 2)).reshape(-1, 1)
r = np.corrcoef(np.hstack((y, mu)).T)
R2 = r[0, 1] ** 2
# coefficient of determination
# adjusted coefficient of determination
R2adj = max(1 - (1 - R2) * (my - 1) / (my - nx - nz - 1), 0)
res = y - mu
if nz == 1:
self.family = 'binomial'
else:
self.family = 'multinomial'
self.link = 'logit'
self.numvar = nx + nz
self.numobs = my
self.numk = nz + 1
self.df = max(my - nx - nz, 0)
self.df_null = my - nz
# nulldf; nulldf = n - nz
self.params = tb[:(nz + nx)]
self.params_ci = 1
self.params_std = se
self.params_cov = pcov
self.params_tstat = (self.params / self.params_std)
# options.estdispersn dispersion_parameter=='mean_deviance'
if False:
self.params_pvalue = 2. * _cdft(-abs(self.params_tstat), self.df)
bcrit = -se * _invt(self.alpha / 2, self.df)
else:
self.params_pvalue = 2. * _cdfnorm(-abs(self.params_tstat))
bcrit = -se * _invnorm(self.alpha / 2)
# end
self.params_ci = np.vstack((self.params + bcrit, self.params - bcrit))
self.mu = gammai
self.eta = _logit(gammai)
self.X = X
[dev, dl, d2l, p] = self.loglike(tb, y, X, z, z1, numout=4)
self.theta = theta
self.beta = beta
self.gamma = gammai
self.residual = res.T
self.residualD = np.sign(self.residual) * sqrt(-2 * np.log(p))
self.deviance = dev
self.deviance_null = nulldev
self.d2L = d2l
self.dL = dl.T
self.dispersionfit = 1
self.dispersion = 1
self.R2 = R2
self.R2adj = R2adj
self.numiter = i
self.converged = i < self.maxiter
self.note = ''
self.date = now()
if (self.verbose):
self.summary()
def compare(self, object2):
""" Compare small LOGIT versus large one
CALL [pvalue] = compare(object2)
The standard hypothesis test of a larger linear regression
model against a smaller one. The standard Chi2-test is used.
The output is the p-value, the residuals from the smaller
model, and the residuals from the larger model.
See also fitls
"""
try:
if self.numvar > object2.numvar:
devL = self.deviance
nL = self.numvar
dfL = self.df
Al = self.X
disprsn = self.dispersionfit
devs = object2.deviance
ns = object2.numvar
dfs = object2.df
As = object2.X
else:
devL = object2.deviance
nL = object2.numvar
dfL = object2.df
Al = object2.X
disprsn = object2.dispersionfit
devs = self.deviance
ns = self.numvar
dfs = self.df
As = self.X
# end
            if (((As - np.dot(Al, np.linalg.lstsq(Al, As)[0])) >
                    500 * np.finfo(float).eps).any() or
                    object2.family != self.family or
                    object2.link != self.link):
warnings.warn('Small model not included in large model,' +
' result is rubbish!')
except:
raise ValueError('Apparently not a valid regression object')
pmq = np.abs(nL - ns)
print(' ')
print(' Analysis of Deviance')
if False: # options.estdispersn
localstat = abs(devL - devs) / disprsn / pmq
# localpvalue = 1-cdff(localstat, pmq, dfL)
# print('Model DF Residual deviance F-stat Pr(>F)')
else:
localstat = abs(devL - devs) / disprsn
localpvalue = 1 - _cdfchi2(localstat, pmq)
print('Model DF Residual deviance Chi2-stat ' +
' Pr(>Chi2)')
# end
print('Small %d %12.4f %12.4f %12.4f' %
(dfs, devs, localstat, localpvalue))
print('Full %d %12.4f' % (dfL, devL))
print(' ')
return localpvalue
def anode(self):
print(' ')
print(' Analysis of Deviance')
if False: # %options.estdispersn
localstat = abs(self.deviance_null - self.deviance) / \
self.dispersionfit / (self.numvar - 1)
localpvalue = 1 - _cdff(localstat, self.numvar - 1, self.df)
print(
'Model DF Residual deviance F-stat Pr(>F)')
else:
localstat = abs(
self.deviance_null - self.deviance) / self.dispersionfit
localpvalue = 1 - _cdfchi2(localstat, self.numvar - 1)
print('Model DF Residual deviance Chi2-stat' +
' Pr(>Chi2)')
# end
print('Null %d %12.4f %12.4f %12.4f' %
(self.df_null, self.deviance_null, localstat, localpvalue))
print('Full %d %12.4f' % (self.df, self.deviance))
print(' ')
print(' R2 = %2.4f, R2adj = %2.4f' % (self.R2, self.R2adj))
print(' ')
return localpvalue
def summary(self):
txtlink = self.link
print('Call:')
print('reglogit(formula = %s(Pr(grp(y)<=i)) ~ theta_i+beta*x, family = %s)' %
(txtlink, self.family))
print(' ')
print('Deviance Residuals:')
m, q1, me, q3, M = np.percentile(
self.residualD, q=[0, 25, 50, 75, 100])
print(' Min 1Q Median 3Q Max ')
print('%2.4f %2.4f %2.4f %2.4f %2.4f' %
(m, q1, me, q3, M))
print(' ')
print(' Coefficients:')
if False: # %options.estdispersn
print(
' Estimate Std. Error t value Pr(>|t|)')
else:
print(
' Estimate Std. Error z value Pr(>|z|)')
# end
e, s, z, p = (self.params, self.params_std, self.params_tstat,
self.params_pvalue)
for i in range(self.numk):
print(
'theta_%d %2.4f %2.4f %2.4f %2.4f' %
(i, e[i], s[i], z[i], p[i]))
for i in range(self.numk, self.numvar):
print(
' beta_%d %2.4f %2.4f %2.4f %2.4f\n' %
(i - self.numk, e[i], s[i], z[i], p[i]))
print(' ')
print('(Dispersion parameter for %s family taken to be %2.2f)' %
(self.family, self.dispersionfit))
print(' ')
if True: # %options.constant
print(' Null deviance: %2.4f on %d degrees of freedom' %
(self.deviance_null, self.df_null))
# end
print('Residual deviance: %2.4f on %d degrees of freedom' %
(self.deviance, self.df))
self.anode()
#end % summary
def predict(self, Xnew=None, alpha=0.05, fulloutput=False):
"""LOGIT/PREDICT Predict from a fitted LOGIT object
CALL [y,ylo,yup] = predict(Xnew,options)
y = predicted value
ylo,yup = 100(1-alpha)% confidence interval for y
Xnew = new covariate
options = options struct defining the calculation
.alpha : confidence coefficient (default 0.05)
.size : size if binomial family (default 1).
"""
[_mx, nx] = self.X.shape
if Xnew is None:
Xnew = self.X
else:
Xnew = np.atleast_2d(Xnew)
notnans = np.flatnonzero(1 - (1 - np.isfinite(Xnew)).any(axis=1))
Xnew = Xnew[notnans, :]
[n, p] = Xnew.shape
if p != nx:
raise ValueError('Number of covariates must match the number' +
' of regression coefficients')
nz = self.numk - 1
one = ones((n, 1))
if (nx > 0):
eta = np.dot(Xnew, self.beta).reshape(-1, 1) + self.theta
else:
eta = one * self.theta
# end
y = np.diff(
np.hstack((zeros((n, 1)), _logitinv(eta), one)), n=1, axis=1)
if fulloutput:
eps = np.finfo(float).eps
pcov = self.params_cov
if (nx > 0):
np1 = pcov.shape[0]
[U, S, V] = np.linalg.svd(pcov, 0)
# %squareroot of pcov
R = np.dot(U, np.dot(np.diag(sqrt(S)), V))
ib = np.r_[0, nz:np1]
# Var(eta_i) = var(theta_i+Xnew*b)
vareta = zeros((n, nz))
u = np.hstack((one, Xnew))
for i in range(nz):
ib[0] = i
vareta[:, i] = np.maximum(
((np.dot(u, R[ib][:, ib])) ** 2).sum(axis=1), eps)
# end
else:
vareta = np.diag(pcov)
# end
crit = -_invnorm(alpha / 2)
ecrit = crit * sqrt(vareta)
mulo = _logitinv(eta - ecrit)
muup = _logitinv(eta + ecrit)
ylo1 = np.diff(np.hstack((zeros((n, 1)), mulo, one)), n=1, axis=1)
yup1 = np.diff(np.hstack((zeros((n, 1)), muup, one)), n=1, axis=1)
ylo = np.minimum(ylo1, yup1)
yup = np.maximum(ylo1, yup1)
for i in range(1, nz): # = 2:self.numk-1
yup[:, i] = np.vstack(
(yup[:, i], muup[:, i] - mulo[:, i - 1])).max(axis=0)
# end
return y, ylo, yup
return y
def loglike(self, beta, y, x, z, z1, numout=3):
"""
        [dev, dl, d2l, p] = loglike(beta, y, x, z, z1)
Calculates likelihood for the ordinal logistic regression model.
"""
# Author: Gordon K. Smyth <gks@maths.uq.oz.au>
zx = np.hstack((z, x))
z1x = np.hstack((z1, x))
g = _logitinv(np.dot(zx, beta)).reshape((-1, 1))
g1 = _logitinv(np.dot(z1x, beta)).reshape((-1, 1))
g = np.maximum(y == y.max(), g)
g1 = np.minimum(y > y.min(), g1)
p = g - g1
dev = -2 * np.log(p).sum()
"""[dl, d2l] = derivatives of loglike(beta, y, x, z, z1)
% Called by logistic_regression. Calculates derivates of the
% log-likelihood for ordinal logistic regression model.
"""
# Author: Gordon K. Smyth <gks@maths.uq.oz.au>
# Description: Derivates of log-likelihood in logistic regression
# first derivative
v = g * (1 - g) / p
v1 = g1 * (1 - g1) / p
dlogp = np.hstack((((v * z) - (v1 * z1)), ((v - v1) * x)))
dl = np.sum(dlogp, axis=0)
# second derivative
w = v * (1 - 2 * g)
w1 = v1 * (1 - 2 * g1)
d2l = np.dot(zx.T, (w * zx)) - np.dot(
z1x.T, (w1 * z1x)) - np.dot(dlogp.T, dlogp)
if numout == 4:
return dev, dl, d2l, p
else:
return dev, dl, d2l
#end %function
def _test_dispersion_idx():
import wafo.data
xn = wafo.data.sea()
t, data = xn.T
Ie = findpot(data, t, 0, 5)
di, _u, _ok_u = dispersion_idx(data[Ie], t[Ie], tb=100)
di.plot() # a threshold around 1 seems appropriate.
di.show()
def _test_findpot():
import pylab
import wafo.data
from wafo.misc import findtc
x = wafo.data.sea()
t, data = x[:, :].T
itc, _iv = findtc(data, 0, 'dw')
ytc, ttc = data[itc], t[itc]
ymin = 2 * data.std()
tmin = 10 # sec
I = findpot(data, t, ymin, tmin)
yp, tp = data[I], t[I]
Ie = findpot(yp, tp, ymin, tmin)
ye, te = yp[Ie], tp[Ie]
pylab.plot(t, data, ttc, ytc, 'ro', t,
zeros(len(t)), ':', te, ye, 'kx', tp, yp, '+')
pylab.show()
pass
def _test_reslife():
import wafo
R = wafo.stats.genpareto.rvs(0.1, 2, 2, size=100)
mrl = reslife(R, nu=20)
mrl.plot()
def test_reglogit():
y = np.array([1, 1, 2, 1, 3, 2, 3, 2, 3, 3]).reshape(-1, 1)
x = np.arange(1, 11).reshape(-1, 1)
b = RegLogit()
b.fit(y, x)
# b.display() # members and methods
b.summary()
[mu, plo, pup] = b.predict(fulloutput=True) # @UnusedVariable
pass
# plot(x,mu,'g',x,plo,'r:',x,pup,'r:')
def test_reglogit2():
n = 40
x = np.sort(5 * np.random.rand(n, 1) - 2.5, axis=0)
y = (np.cos(x) > 2 * np.random.rand(n, 1) - 1)
b = RegLogit()
b.fit(y, x)
# b.display() # members and methods
b.summary()
[mu, plo, pup] = b.predict(fulloutput=True)
import matplotlib.pyplot as pl
pl.plot(x, mu, 'g', x, plo, 'r:', x, pup, 'r:')
pl.show()
def test_sklearn0():
from sklearn.linear_model import LogisticRegression
from sklearn import datasets # @UnusedImport
# FIXME: the iris dataset has only 4 features!
# iris = datasets.load_iris()
# X = iris.data
# y = iris.target
X = np.sort(5 * np.random.rand(40, 1) - 2.5, axis=0)
y = (2 * (np.cos(X) > 2 * np.random.rand(40, 1) - 1) - 1).ravel()
score = []
# Set regularization parameter
cvals = np.logspace(-1, 1, 5)
for C in cvals:
clf_LR = LogisticRegression(C=C, penalty='l2')
clf_LR.fit(X, y)
score.append(clf_LR.score(X, y))
#plot(cvals, score)
def test_sklearn():
X = np.sort(5 * np.random.rand(40, 1) - 2.5, axis=0)
y = (2 * (np.cos(X) > 2 * np.random.rand(40, 1) - 1) - 1).ravel()
from sklearn.svm import SVR
#
# look at the results
import pylab as pl
pl.scatter(X, .5 * np.cos(X) + 0.5, c='k', label='True model')
    # pl.hold('on')  # hold() was removed from matplotlib; overlaying is now the default
cvals = np.logspace(-1, 3, 20)
score = []
for c in cvals:
        # note: SVR does not take a 'probability' argument (that is SVC only)
        svr_rbf = SVR(kernel='rbf', C=c, gamma=0.1)
svrf = svr_rbf.fit(X, y)
y_rbf = svrf.predict(X)
score.append(svrf.score(X, y))
pl.plot(X, y_rbf, label='RBF model c=%g' % c)
pl.xlabel('data')
pl.ylabel('target')
pl.title('Support Vector Regression')
pl.legend()
pl.show()
def test_sklearn1():
X = np.sort(5 * np.random.rand(40, 1) - 2.5, axis=0)
y = (2 * (np.cos(X) > 2 * np.random.rand(40, 1) - 1) - 1).ravel()
from sklearn.svm import SVR
# cvals= np.logspace(-1,4,10)
    # note: SVR does not take a 'probability' argument (that is SVC only)
    svr_rbf = SVR(kernel='rbf', C=1e4, gamma=0.1)
    svr_lin = SVR(kernel='linear', C=1e4)
    svr_poly = SVR(kernel='poly', C=1e4, degree=2)
y_rbf = svr_rbf.fit(X, y).predict(X)
y_lin = svr_lin.fit(X, y).predict(X)
y_poly = svr_poly.fit(X, y).predict(X)
#
# look at the results
import pylab as pl
pl.scatter(X, .5 * np.cos(X) + 0.5, c='k', label='True model')
    # pl.hold('on')  # hold() was removed from matplotlib; overlaying is now the default
pl.plot(X, y_rbf, c='g', label='RBF model')
pl.plot(X, y_lin, c='r', label='Linear model')
pl.plot(X, y_poly, c='b', label='Polynomial model')
pl.xlabel('data')
pl.ylabel('target')
pl.title('Support Vector Regression')
pl.legend()
pl.show()
def test_docstrings():
#_test_dispersion_idx()
import doctest
doctest.testmod()
if __name__ == '__main__':
# test_reglogit2()
    test_docstrings()