You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1637 lines
55 KiB
Python

'''
Contains FitDistribution and Profile class, which are
important classes for fitting to various Continous and Discrete Probability
Distributions
Author: Per A. Brodtkorb 2008
'''
from __future__ import division, absolute_import
import warnings
from wafo.plotbackend import plotbackend as plt
from wafo.misc import ecross, findcross, argsreduce
from wafo.stats._constants import _EPS, _XMAX, _XMIN
from wafo.stats._distn_infrastructure import rv_frozen, rv_continuous
from scipy._lib.six import string_types
import numdifftools as nd # @UnresolvedImport
from scipy import special
from scipy.linalg import pinv2
from scipy import optimize
import numpy as np
from scipy.special import expm1, log1p
from numpy import (arange, zeros, log, sqrt, exp,
asarray, nan, pi, isfinite)
from numpy import flatnonzero as nonzero
__all__ = ['Profile', 'FitDistribution']
floatinfo = np.finfo(float)
arr = asarray
# all = alltrue # @ReservedAssignment
15 years ago
9 years ago
def _burr_link(x, logsf, phat, ix):
c, d, loc, scale = phat
9 years ago
logp = log(-expm1(logsf))
xn = (x - loc) / scale
if ix == 1:
return -logp / log1p(xn**(-c))
if ix == 0:
return log1p(-exp(-logp / d)) / log(xn)
if ix == 2:
return x - scale * exp(log1p(-exp(-logp / d)) / c)
if ix == 3:
return (x - loc) / exp(log1p(-exp(-logp / d)) / c)
raise IndexError('Index to the fixed parameter is out of bounds')
9 years ago
def _expon_link(x, logsf, phat, ix):
if ix == 1:
9 years ago
return - (x - phat[0]) / logsf
if ix == 0:
9 years ago
return x + phat[1] * logsf
raise IndexError('Index to the fixed parameter is out of bounds')
9 years ago
def _weibull_min_link(x, logsf, phat, ix):
c, loc, scale = phat
if ix == 0:
9 years ago
return log(-logsf) / log((x - loc) / scale)
if ix == 1:
9 years ago
return x - scale * (-logsf) ** (1. / c)
if ix == 2:
9 years ago
return (x - loc) / (-logsf) ** (1. / c)
raise IndexError('Index to the fixed parameter is out of bounds')
9 years ago
def _exponweib_link(x, logsf, phat, ix):
a, c, loc, scale = phat
9 years ago
logP = -log(-expm1(logsf))
xn = (x - loc) / scale
if ix == 0:
return - logP / log(-expm1(-xn**c))
if ix == 1:
return log(-log1p(- logP**(1.0 / a))) / log(xn)
if ix == 2:
return x - (-log1p(- logP**(1.0/a))) ** (1.0 / c) * scale
if ix == 3:
return (x - loc) / (-log1p(- logP**(1.0/a))) ** (1.0 / c)
raise IndexError('Index to the fixed parameter is out of bounds')
9 years ago
def _genpareto_link(x, logsf, phat, ix):
# Reference
# Stuart Coles (2004)
# "An introduction to statistical modelling of extreme values".
# Springer series in statistics
c, loc, scale = phat
if ix == 2:
# Reorganizing w.r.t.scale, Eq. 4.13 and 4.14, pp 81 in
# Coles (2004) gives
9 years ago
# link = -(x-loc)*c/expm1(-c*logsf)
if c != 0.0:
9 years ago
phati = (x - loc) * c / expm1(-c * logsf)
else:
9 years ago
phati = -(x - loc) / logsf
elif ix == 1:
if c != 0:
9 years ago
phati = x + scale * expm1(c * logsf) / c
else:
9 years ago
phati = x + scale * logsf
elif ix == 0:
raise NotImplementedError(
9 years ago
'link(x,logsf,phat,i) where i=0 is not implemented!')
else:
raise IndexError('Index to the fixed parameter is out of bounds')
return phati
9 years ago
def _genextreme_link(x, logsf, phat, ix):
c, loc, scale = phat
9 years ago
loglogP = log(-log(-expm1(logsf)))
if ix == 2:
# link = -(x-loc)*c/expm1(c*log(-logP))
if c != 0.0:
return -(x - loc) * c / expm1(c * loglogP)
return -(x - loc) / loglogP
if ix == 1:
if c != 0:
return x + scale * expm1(c * loglogP) / c
return x + scale * loglogP
if ix == 0:
raise NotImplementedError(
9 years ago
'link(x,logsf,phat,i) where i=0 is not implemented!')
raise IndexError('Index to the fixed parameter is out of bounds')
9 years ago
def _genexpon_link(x, logsf, phat, ix):
a, b, c, loc, scale = phat
xn = (x - loc) / scale
fact1 = (xn + expm1(-c * xn) / c)
if ix == 0:
9 years ago
return b * fact1 + logsf # a
if ix == 1:
9 years ago
return (a - logsf) / fact1 # b
if ix in [2, 3, 4]:
raise NotImplementedError('Only implemented for index in [0,1]!')
raise IndexError('Index to the fixed parameter is out of bounds')
9 years ago
def _rayleigh_link(x, logsf, phat, ix):
if ix == 1:
9 years ago
return x - phat[0] / sqrt(-2.0 * logsf)
if ix == 0:
9 years ago
return x - phat[1] * sqrt(-2.0 * logsf)
raise IndexError('Index to the fixed parameter is out of bounds')
9 years ago
def _trunclayleigh_link(x, logsf, phat, ix):
c, loc, scale = phat
if ix == 0:
xn = (x - loc) / scale
9 years ago
return - 2 * logsf / xn - xn / 2.0
if ix == 2:
9 years ago
return x - loc / (sqrt(c * c - 2 * logsf) - c)
if ix == 1:
9 years ago
return x - scale * (sqrt(c * c - 2 * logsf) - c)
raise IndexError('Index to the fixed parameter is out of bounds')
LINKS = dict(expon=_expon_link,
weibull_min=_weibull_min_link,
frechet_r=_weibull_min_link,
genpareto=_genpareto_link,
genexpon=_genexpon_link,
rayleigh=_rayleigh_link,
trunclayleigh=_trunclayleigh_link,
genextreme=_genextreme_link,
exponweib=_exponweib_link,
burr=_burr_link)
def chi2isf(p, df):
return special.chdtri(df, p)
def chi2sf(x, df):
return special.chdtrc(df, x)
def norm_ppf(q):
return special.ndtri(q)
class Profile(object):
'''
Profile Log- likelihood or Product Spacing-function for phat[i].
9 years ago
Parameters
----------
fit_dist : FitDistribution object
with ML or MPS estimated distribution parameters.
i : scalar integer
defining which distribution parameter to keep fixed in the
profiling process (default first non-fixed parameter)
pmin, pmax : real scalars
Interval for the parameter, phat[i] used in the optimization of the
profile function (default is based on the 100*(1-alpha)% confidence
interval computed with the delta method.)
9 years ago
n : scalar integer
Max number of points used in Lp (default 100)
alpha : real scalar
confidence coefficent (default 0.05)
9 years ago
Returns
-------
Lp : Profile log-likelihood function with parameters phat given
the data and phat[i], i.e.,
Lp = max(log(f(phat| data, phat[i])))
9 years ago
Member methods
-------------
plot() : Plot profile function with 100(1-alpha)% confidence interval
get_bounds() : Return 100(1-alpha)% confidence interval
Member variables
----------------
fit_dist : FitDistribution data object.
data : profile function values
args : profile function arguments
alpha : confidence coefficient
Lmax : Maximum value of profile function
alpha_cross_level :
PROFILE is a utility function for making inferences on a particular
component of the vector phat.
9 years ago
This is usually more accurate than using the delta method assuming
asymptotic normality of the ML estimator or the MPS estimator.
Examples
--------
# MLE
>>> import wafo.stats as ws
>>> R = ws.weibull_min.rvs(1,size=100);
>>> phat = FitDistribution(ws.weibull_min, R, 1, scale=1, floc=0.0)
# Better 90% CI for phat.par[i=0]
9 years ago
>>> profile_phat_i = Profile(phat, i=0)
>>> profile_phat_i.plot()
>>> phat_ci = profile_phat_i.get_bounds(alpha=0.1)
9 years ago
'''
def __init__(self, fit_dist, i=None, pmin=None, pmax=None, n=100,
alpha=0.05):
self.fit_dist = fit_dist
self.pmin = pmin
self.pmax = pmax
self.n = n
self.alpha = alpha
self.data = None
self.args = None
self._set_indexes(fit_dist, i)
method = fit_dist.method.lower()
self._set_plot_labels(fit_dist, method)
9 years ago
Lmax = self._loglike_max(fit_dist, method)
self.Lmax = Lmax
self.alpha_Lrange = 0.5 * chi2isf(self.alpha, 1)
self.alpha_cross_level = Lmax - self.alpha_Lrange
self._set_profile()
def _set_plot_labels(self, fit_dist, method):
percent = 100 * (1.0 - self.alpha)
self.title = '{:g}% CI for {:s} params'.format(percent,
fit_dist.dist.name)
like_txt = 'likelihood' if method == 'ml' else 'product spacing'
self.ylabel = 'Profile log' + like_txt
self.xlabel = 'phat[{}]'.format(np.ravel(self.i_fixed)[0])
9 years ago
@staticmethod
def _loglike_max(fit_dist, method):
9 years ago
if method.startswith('ml'):
Lmax = fit_dist.LLmax
elif method.startswith('mps'):
Lmax = fit_dist.LPSmax
else:
raise ValueError(
"PROFILE is only valid for ML- or MPS- estimators")
return Lmax
@staticmethod
def _default_i_fixed(fit_dist):
9 years ago
try:
i0 = 1 - np.isfinite(fit_dist.par_fix).argmax()
except Exception:
9 years ago
i0 = 0
return i0
@staticmethod
def _get_not_fixed_mask(fit_dist):
9 years ago
if fit_dist.par_fix is None:
isnotfixed = np.ones(fit_dist.par.shape, dtype=bool)
else:
isnotfixed = 1 - np.isfinite(fit_dist.par_fix)
return isnotfixed
def _check_i_fixed(self):
if self.i_fixed not in self.i_notfixed:
raise IndexError("Index i must be equal to an index to one of " +
"the free parameters.")
def _set_indexes(self, fit_dist, i):
if i is None:
i = self._default_i_fixed(fit_dist)
self.i_fixed = np.atleast_1d(i)
isnotfixed = self._get_not_fixed_mask(fit_dist)
self.i_notfixed = nonzero(isnotfixed)
self._check_i_fixed()
isfree = isnotfixed
isfree[self.i_fixed] = False
self.i_free = nonzero(isfree)
@staticmethod
def _local_link(fix_par, par):
"""
Return fixed distribution parameter
"""
9 years ago
return fix_par
def _correct_Lmax(self, Lmax, free_par, fix_par):
if Lmax > self.Lmax: # foundNewphat = True
dL = self.Lmax - Lmax
self.alpha_cross_level -= dL
self.Lmax = Lmax
par = self._par.copy()
par[self.i_free] = free_par
par[self.i_fixed] = fix_par
self.best_par = par
self._par = par
warnings.warn(
'The fitted parameters does not provide the optimum fit. ' +
'Something wrong with fit (par = {})'.format(str(par)))
def _profile_optimum(self, phatfree0, p_opt):
phatfree = optimize.fmin(self._profile_fun, phatfree0, args=(p_opt,),
disp=0)
9 years ago
Lmax = -self._profile_fun(phatfree, p_opt)
self._correct_Lmax(Lmax, phatfree, p_opt)
return Lmax, phatfree
def _get_p_opt(self):
return self._par[self.i_fixed]
def _set_profile(self):
self._par = self.fit_dist.par.copy()
9 years ago
# Set up variable to profile and _local_link function
p_opt = self._get_p_opt()
9 years ago
phatfree = self._par[self.i_free].copy()
pvec = self._get_pvec(phatfree, p_opt)
self.data = np.ones_like(pvec) * nan
k1 = (pvec >= p_opt).argmax()
for size, step in ((-1, -1), (np.size(pvec), 1)):
9 years ago
phatfree = self._par[self.i_free].copy()
for ix in range(k1, size, step):
Lmax, phatfree = self._profile_optimum(phatfree, pvec[ix])
self.data[ix] = Lmax
if ix != k1 and Lmax < self.alpha_cross_level:
9 years ago
break
np.putmask(pvec, np.isnan(self.data), nan)
self.args = pvec
self._prettify_profile()
def _prettify_profile(self):
pvec = self.args
ix = nonzero(np.isfinite(pvec))
self.data = self.data[ix]
self.args = pvec[ix]
cond = self.data == -np.inf
if np.any(cond):
9 years ago
ind, = cond.nonzero()
self.data.put(ind, floatinfo.min / 2.0)
ind1 = np.where(ind == 0, ind, ind - 1)
cl = self.alpha_cross_level - self.alpha_Lrange / 2.0
try:
t0 = ecross(self.args, self.data, ind1, cl)
self.data.put(ind, cl)
self.args.put(ind, t0)
except IndexError as err:
warnings.warn(str(err))
9 years ago
def _get_variance(self):
return self.fit_dist.par_cov[self.i_fixed, :][:, self.i_fixed]
def _approx_p_min_max(self, p_opt):
pvar = self._get_variance()
if pvar <= 1e-5 or np.isnan(pvar):
pvar = max(abs(p_opt) * 0.5, 0.2)
pvar = max(pvar, 0.1)
p_crit = -norm_ppf(self.alpha / 2.0) * sqrt(np.ravel(pvar)) * 1.5
return p_opt - p_crit * 5, p_opt + p_crit * 5
def _p_min_max(self, phatfree0, p_opt):
p_low, p_up = self._approx_p_min_max(p_opt)
pmin, pmax = self.pmin, self.pmax
if pmin is None:
pmin = self._search_pmin(phatfree0, p_low, p_opt)
if pmax is None:
pmax = self._search_pmax(phatfree0, p_up, p_opt)
return pmin, pmax
def _adaptive_pvec(self, p_opt, pmin, pmax):
p_crit_low = (p_opt - pmin) / 5
p_crit_up = (pmax - p_opt) / 5
n4 = np.floor(self.n / 4.0)
a, b = p_opt - p_crit_low, p_opt + p_crit_up
pvec1 = np.linspace(pmin, a, n4 + 1)
pvec2 = np.linspace(a, b, self.n - 2 * n4)
pvec3 = np.linspace(b, pmax, n4 + 1)
pvec = np.unique(np.hstack((pvec1, p_opt, pvec2, pvec3)))
return pvec
def _get_pvec(self, phatfree0, p_opt):
''' return proper interval for the variable to profile
'''
if self.pmin is None or self.pmax is None:
pmin, pmax = self._p_min_max(phatfree0, p_opt)
return self._adaptive_pvec(p_opt, pmin, pmax)
return np.linspace(self.pmin, self.pmax, self.n)
def _search_pmin(self, phatfree0, p_min0, p_opt):
phatfree = phatfree0.copy()
dp = (p_opt - p_min0)/40
if dp < 1e-2:
dp = 0.1
p_min_opt = p_opt - dp
Lmax, phatfree = self._profile_optimum(phatfree, p_opt)
for _j in range(50):
p_min = p_opt - dp
Lmax, phatfree = self._profile_optimum(phatfree, p_min)
if np.isnan(Lmax):
dp *= 0.33
elif Lmax < self.alpha_cross_level - self.alpha_Lrange * 5:
p_min_opt = p_min
dp *= 0.33
elif Lmax < self.alpha_cross_level:
p_min_opt = p_min
break
else:
dp *= 1.67
return p_min_opt
def _search_pmax(self, phatfree0, p_max0, p_opt):
phatfree = phatfree0.copy()
dp = (p_max0 - p_opt)/40
if dp < 1e-2:
dp = 0.1
p_max_opt = p_opt + dp
Lmax, phatfree = self._profile_optimum(phatfree, p_opt)
for _j in range(50):
p_max = p_opt + dp
Lmax, phatfree = self._profile_optimum(phatfree, p_max)
if np.isnan(Lmax):
dp *= 0.33
elif Lmax < self.alpha_cross_level - self.alpha_Lrange * 2:
p_max_opt = p_max
dp *= 0.33
elif Lmax < self.alpha_cross_level:
p_max_opt = p_max
break
else:
dp *= 1.67
return p_max_opt
def _profile_fun(self, free_par, fix_par):
''' Return negative of loglike or logps function
free_par - vector of free parameters
fix_par - fixed parameter, i.e., either quantile (return level),
probability (return period) or distribution parameter
'''
par = self._par.copy()
par[self.i_free] = free_par
9 years ago
par[self.i_fixed] = self._local_link(fix_par, par)
return self.fit_dist.fitfun(par)
def get_bounds(self, alpha=0.05):
'''Return confidence interval for profiled parameter
'''
if alpha < self.alpha:
warnings.warn(
'Might not be able to return bounds with alpha less than %g' %
9 years ago
self.alpha)
cross_level = self.Lmax - 0.5 * chi2isf(alpha, 1)
ind = findcross(self.data, cross_level)
n = len(ind)
if n == 0:
9 years ago
warnings.warn('''Number of crossings is zero, i.e.,
upper and lower bound is not found!''')
bounds = (self.pmin, self.pmax)
9 years ago
elif n == 1:
9 years ago
x0 = ecross(self.args, self.data, ind, cross_level)
isUpcrossing = self.data[ind] > self.data[ind + 1]
if isUpcrossing:
bounds = (x0, self.pmax)
9 years ago
warnings.warn('Upper bound is larger')
else:
bounds = (self.pmin, x0)
9 years ago
warnings.warn('Lower bound is smaller')
elif n == 2:
bounds = ecross(self.args, self.data, ind, cross_level)
9 years ago
else:
warnings.warn('Number of crossings too large! Something is wrong!')
bounds = ecross(self.args, self.data, ind[[0, -1]], cross_level)
return bounds
9 years ago
def plot(self, axis=None):
'''
Plot profile function for p_opt with 100(1-alpha)% confidence interval.
9 years ago
'''
if axis is None:
axis = plt.gca()
9 years ago
p_ci = self.get_bounds(self.alpha)
axis.plot(
self.args, self.data,
p_ci, [self.Lmax, ] * 2, 'r--',
p_ci, [self.alpha_cross_level, ] * 2, 'r--')
ymax = self.Lmax + self.alpha_Lrange/10
ymin = self.alpha_cross_level - self.alpha_Lrange/10
axis.vlines(p_ci, ymin=ymin, ymax=self.Lmax,
color='r', linestyles='--')
p_opt = self._get_p_opt()
axis.vlines(p_opt, ymin=ymin, ymax=self.Lmax,
color='g', linestyles='--')
9 years ago
axis.set_title(self.title)
axis.set_ylabel(self.ylabel)
axis.set_xlabel(self.xlabel)
axis.set_ylim(ymin=ymin, ymax=ymax)
def plot_all_profiles(phats, plot=None):
if plot is not None:
plt = plot
if phats.par_fix:
indices = phats.i_notfixed
else:
indices = np.arange(len(phats.par))
n = len(indices)
for j, k in enumerate(indices):
plt.subplot(n, 1, j+1)
9 years ago
profile_phat_k = Profile(phats, i=k)
m = 0
9 years ago
while hasattr(profile_phat_k, 'best_par') and m < 7:
phats.fit(*profile_phat_k.best_par)
profile_phat_k = Profile(phats, i=k)
m += 1
9 years ago
profile_phat_k.plot()
if j != 0:
plt.title('')
if j != n//2:
plt.ylabel('')
plt.subplots_adjust(hspace=0.5)
par_txt = ('{:1.2g}, '*len(phats.par))[:-2].format(*phats.par)
plt.suptitle('phat = [{}] (fit metod: {})'.format(par_txt, phats.method))
return phats
class ProfileQuantile(Profile):
'''
Profile Log- likelihood or Product Spacing-function for quantile.
Parameters
----------
fit_dist : FitDistribution object
with ML or MPS estimated distribution parameters.
x : real scalar
Quantile (return value)
i : scalar integer
defining which distribution parameter to keep fixed in the
profiling process (default first non-fixed parameter)
pmin, pmax : real scalars
Interval for the parameter, x, used in the
optimization of the profile function (default is based on the
100*(1-alpha)% confidence interval computed with the delta method.)
n : scalar integer
Max number of points used in Lp (default 100)
alpha : real scalar
confidence coefficent (default 0.05)
link : function connecting the x-quantile and the survival probability
9 years ago
(sf) with the fixed distribution parameter, i.e.:
self.par[i] = link(x, logsf, self.par, i), where
logsf = log(Prob(X>x;phat)).
(default is fetched from the LINKS dictionary)
Returns
-------
Lp : Profile log-likelihood function with parameters phat given
the data, phat[i] and quantile (x) i.e.,
Lp = max(log(f(phat|data,phat(i),x)))
Member methods
-------------
plot() : Plot profile function with 100(1-alpha)% confidence interval
get_bounds() : Return 100(1-alpha)% confidence interval
Member variables
----------------
fit_dist : FitDistribution data object.
data : profile function values
args : profile function arguments
alpha : confidence coefficient
Lmax : Maximum value of profile function
alpha_cross_level :
9 years ago
ProfileQuantile is a utility function for making inferences on the
quantile, x.
This is usually more accurate than using the delta method assuming
asymptotic normality of the ML estimator or the MPS estimator.
Examples
--------
# MLE
>>> import wafo.stats as ws
>>> R = ws.weibull_min.rvs(1,size=100);
>>> phat = FitDistribution(ws.weibull_min, R, 1, scale=1, floc=0.0)
>>> sf = 1./990
>>> x = phat.isf(sf)
# 80% CI for x
9 years ago
>>> profile_x = ProfileQuantile(phat, x)
>>> profile_x.plot()
>>> x_ci = profile_x.get_bounds(alpha=0.2)
'''
def __init__(self, fit_dist, x, i=None, pmin=None, pmax=None, n=100,
alpha=0.05, link=None):
self.x = x
self.log_sf = fit_dist.logsf(x)
if link is None:
link = LINKS.get(fit_dist.dist.name)
self.link = link
super(ProfileQuantile, self).__init__(fit_dist, i=i, pmin=pmin,
pmax=pmax, n=100, alpha=alpha)
def _get_p_opt(self):
return self.x
def _local_link(self, fixed_x, par):
"""
Return fixed distribution parameter from fixed quantile
"""
fix_par = self.link(fixed_x, self.log_sf, par, self.i_fixed)
return fix_par
def _myinvfun(self, phatnotfixed):
mphat = self._par.copy()
mphat[self.i_notfixed] = phatnotfixed
prb = exp(self.log_sf)
return self.fit_dist.dist.isf(prb, *mphat)
def _get_variance(self):
i_notfixed = self.i_notfixed
phatv = self._par
gradfun = nd.Gradient(self._myinvfun)
drl = gradfun(phatv[self.i_notfixed])
pcov = self.fit_dist.par_cov[i_notfixed, :][:, i_notfixed]
pvar = np.sum(np.dot(drl, pcov) * drl)
return pvar
def _set_plot_labels(self, fit_dist, method):
super(ProfileQuantile, self)._set_plot_labels(fit_dist, method)
percent = 100 * (1.0 - self.alpha)
self.title = '{:g}% CI for {:s} quantile'.format(percent,
fit_dist.dist.name)
self.xlabel = 'x'
class ProfileProbability(Profile):
''' Profile Log- likelihood or Product Spacing-function probability.
Parameters
----------
fit_dist : FitDistribution object
with ML or MPS estimated distribution parameters.
logsf : real scalar
logarithm of survival probability
i : scalar integer
defining which distribution parameter to keep fixed in the
profiling process (default first non-fixed parameter)
pmin, pmax : real scalars
Interval for the parameter, log_sf, used in the
optimization of the profile function (default is based on the
100*(1-alpha)% confidence interval computed with the delta method.)
n : scalar integer
Max number of points used in Lp (default 100)
alpha : real scalar
confidence coefficent (default 0.05)
link : function connecting the x-quantile and the survival probability
9 years ago
(sf) with the fixed distribution parameter, i.e.:
self.par[i] = link(x, logsf, self.par, i), where
logsf = log(Prob(X>x;phat)).
(default is fetched from the LINKS dictionary)
Returns
-------
Lp : Profile log-likelihood function with parameters phat given
the data, phat[i] and quantile (x) i.e.,
Lp = max(log(f(phat|data,phat(i),x)))
Member methods
-------------
plot() : Plot profile function with 100(1-alpha)% confidence interval
get_bounds() : Return 100(1-alpha)% confidence interval
Member variables
----------------
fit_dist : FitDistribution data object.
data : profile function values
args : profile function arguments
alpha : confidence coefficient
Lmax : Maximum value of profile function
alpha_cross_level :
9 years ago
ProfileProbability is a utility function for making inferences the survival
probability (sf).
This is usually more accurate than using the delta method assuming
asymptotic normality of the ML estimator or the MPS estimator.
Examples
--------
# MLE
>>> import wafo.stats as ws
>>> R = ws.weibull_min.rvs(1,size=100);
>>> phat = FitDistribution(ws.weibull_min, R, 1, scale=1, floc=0.0)
>>> sf = 1./990
# 80% CI for sf
9 years ago
>>> profile_logsf = ProfileProbability(phat, np.log(sf))
>>> profile_logsf.plot()
>>> logsf_ci = profile_logsf.get_bounds(alpha=0.2)
'''
def __init__(self, fit_dist, logsf, i=None, pmin=None, pmax=None, n=100,
alpha=0.05, link=None):
self.x = fit_dist.isf(np.exp(logsf))
self.log_sf = logsf
if link is None:
link = LINKS.get(fit_dist.dist.name)
self.link = link
super(ProfileProbability, self).__init__(fit_dist, i=i, pmin=pmin,
pmax=pmax, n=100, alpha=alpha)
def _get_p_opt(self):
return self.log_sf
def _local_link(self, fixed_log_sf, par):
"""
Return fixed distribution parameter from fixed log_sf
"""
fix_par = self.link(self.x, fixed_log_sf, par, self.i_fixed)
return fix_par
15 years ago
def _myprbfun(self, phatnotfixed):
mphat = self._par.copy()
mphat[self.i_notfixed] = phatnotfixed
9 years ago
logsf = self.fit_dist.dist.logsf(self.x, *mphat)
return np.where(np.isfinite(logsf), logsf, np.nan)
def _get_variance(self):
i_notfixed = self.i_notfixed
phatv = self._par
gradfun = nd.Gradient(self._myprbfun)
drl = gradfun(phatv[self.i_notfixed])
pcov = self.fit_dist.par_cov[i_notfixed, :][:, i_notfixed]
pvar = np.sum(np.dot(drl, pcov) * drl)
return pvar
def _set_plot_labels(self, fit_dist, method):
super(ProfileProbability, self)._set_plot_labels(fit_dist, method)
percent = 100 * (1.0 - self.alpha)
self.title = '{:g}% CI for {:s} probability'.format(percent,
fit_dist.dist.name)
self.xlabel = 'log(sf)'
class FitDistribution(rv_frozen):
'''
Return estimators to shape, location, and scale from data
Starting points for the fit are given by input arguments. For any
arguments not given starting points, dist._fitstart(data) is called
to get the starting estimates.
You can hold some parameters fixed to specific values by passing in
keyword arguments f0..fn for shape paramters and floc, fscale for
location and scale parameters.
Parameters
----------
dist : scipy distribution object
distribution to fit to data
data : array-like
Data to use in calculating the ML or MPS estimators
args : optional
Starting values for any shape arguments (those not specified
will be determined by dist._fitstart(data))
kwds : loc, scale
Starting values for the location and scale parameters
Special keyword arguments are recognized as holding certain
parameters fixed:
f0..fn : hold respective shape paramters fixed
floc : hold location parameter fixed to specified value
fscale : hold scale parameter fixed to specified value
method : of estimation. Options are
'ml' : Maximum Likelihood method (default)
'mps': Maximum Product Spacing method
alpha : scalar, optional
Confidence coefficent (default=0.05)
search : bool
If true search for best estimator (default),
otherwise return object with initial distribution parameters
copydata : bool
If true copydata (default)
optimizer : The optimizer to use. The optimizer must take func,
and starting position as the first two arguments,
plus args (for extra arguments to pass to the
function to be optimized) and disp=0 to suppress
output as keyword arguments.
Return
------
phat : FitDistribution object
Fitted distribution object with following member variables:
LLmax : loglikelihood function evaluated using par
LPSmax : log product spacing function evaluated using par
pvalue : p-value for the fit
par : distribution parameters (fixed and fitted)
par_cov : covariance of distribution parameters
par_fix : fixed distribution parameters
par_lower : lower (1-alpha)% confidence bound for the parameters
par_upper : upper (1-alpha)% confidence bound for the parameters
Note
----
`data` is sorted using this function, so if `copydata`==False the data
in your namespace will be sorted as well.
Examples
--------
Estimate distribution parameters for weibull_min distribution.
>>> import wafo.stats as ws
>>> R = ws.weibull_min.rvs(1,size=100);
>>> phat = FitDistribution(ws.weibull_min, R, 1, scale=1, floc=0.0)
9 years ago
# Plot various diagnostic plots to asses quality of fit.
>>> phat.plotfitsummary()
9 years ago
# phat.par holds the estimated parameters
# phat.par_upper upper CI for parameters
# phat.par_lower lower CI for parameters
9 years ago
# Better 90% CI for phat.par[0]
>>> profile_phat_i = phat.profile(i=0)
>>> profile_phat_i.plot()
>>> p_ci = profile_phat_i.get_bounds(alpha=0.1)
9 years ago
>>> sf = 1./990
>>> x = phat.isf(sf)
9 years ago
# 80% CI for x
>>> profile_x = phat.profile_quantile(x=x)
>>> profile_x.plot()
>>> x_ci = profile_x.get_bounds(alpha=0.2)
# 80% CI for logsf=log(sf)
>>> profile_logsf = phat.profile_probability(log(sf))
>>> profile_logsf.plot()
>>> sf_ci = profile_logsf.get_bounds(alpha=0.2)
'''
def __init__(self, dist, data, args=(), method='ML', alpha=0.05,
search=True, copydata=True, **kwds):
extradoc = '''
plotfitsummary()
Plot various diagnostic plots to asses quality of fit.
plotecdf()
Plot Empirical and fitted Cumulative Distribution Function
plotesf()
Plot Empirical and fitted Survival Function
plotepdf()
Plot Empirical and fitted Probability Distribution Function
plotresq()
Displays a residual quantile plot.
plotresprb()
Displays a residual probability plot.
profile()
Return Profile Log- likelihood or Product Spacing-function.
Parameters
----------
x : array-like
quantiles
q : array-like
lower or upper tail probability
size : int or tuple of ints, optional
shape of random variates (default computed from input arguments )
moments : str, optional
composed of letters ['mvsk'] specifying which moments to compute where
'm' = mean, 'v' = variance, 's' = (Fisher's) skew and
'k' = (Fisher's) kurtosis. (default='mv')
'''
# Member variables
# ----------------
# data - data used in fitting
# alpha - confidence coefficient
# method - method used
# LLmax - loglikelihood function evaluated using par
# LPSmax - log product spacing function evaluated using par
# pvalue - p-value for the fit
# search - True if search for distribution parameters (default)
# copydata - True if copy input data (default)
#
# par - parameters (fixed and fitted)
# par_cov - covariance of parameters
# par_fix - fixed parameters
# par_lower - lower (1-alpha)% confidence bound for the parameters
# par_upper - upper (1-alpha)% confidence bound for the parameters
#
# '''
self.__doc__ = str(rv_frozen.__doc__) + extradoc
self.dist = dist
self.method = method
self.alpha = alpha
self.par_fix = None
self.search = search
self.copydata = copydata
self.data = np.ravel(data)
if self.copydata:
self.data = self.data.copy()
self.data.sort()
if isinstance(args, (float, int)):
args = (args, )
self.fit(*args, **kwds)
def _set_fixed_par(self, fixedn):
self.par_fix = [nan] * len(self.par)
for i in fixedn:
self.par_fix[i] = self.par[i]
self.i_notfixed = nonzero(1 - isfinite(self.par_fix))
self.i_fixed = nonzero(isfinite(self.par_fix))
def fit(self, *args, **kwds):
par, fixedn = self._fit(*args, **kwds.copy())
super(FitDistribution, self).__init__(self.dist, *par)
self.par = arr(par)
somefixed = len(fixedn) > 0
if somefixed:
self._set_fixed_par(fixedn)
self.par_cov = self._compute_cov()
# Set confidence interval for parameters
pvar = np.diag(self.par_cov)
zcrit = -norm_ppf(self.alpha / 2.0)
self.par_lower = self.par - zcrit * sqrt(pvar)
self.par_upper = self.par + zcrit * sqrt(pvar)
self.LLmax = -self._nnlf(self.par, self.data)
self.LPSmax = -self._nlogps(self.par, self.data)
self.pvalue = self._pvalue(self.par, self.data,
unknown_numpar=len(par)-len(fixedn))
@property
def method(self):
return self._method
@method.setter
def method(self, method):
self._method = method.lower()
if self._method.startswith('mps'):
self._fitfun = self._nlogps
else:
self._fitfun = self._nnlf
def __repr__(self):
params = ['alpha', 'method', 'LLmax', 'LPSmax', 'pvalue',
'par', 'par_lower', 'par_upper', 'par_fix', 'par_cov']
t = ['%s:\n' % self.__class__.__name__]
for par in params:
t.append('%s = %s\n' % (par, str(getattr(self, par))))
return ''.join(t)
def _reduce_func(self, args, kwds):
# First of all, convert fshapes params to fnum: eg for stats.beta,
# shapes='a, b'. To fix `a`, can specify either `f1` or `fa`.
# Convert the latter into the former.
shapes = self.dist.shapes
if shapes:
shapes = shapes.replace(',', ' ').split()
for j, s in enumerate(shapes):
val = kwds.pop('f' + s, None) or kwds.pop('fix_' + s, None)
if val is not None:
key = 'f%d' % j
if key in kwds:
raise ValueError("Duplicate entry for %s." % key)
else:
kwds[key] = val
args = list(args)
9 years ago
nargs = len(args)
fixedn = []
9 years ago
names = ['f%d' % n for n in range(nargs - 2)] + ['floc', 'fscale']
x0 = []
for n, key in enumerate(names):
if key in kwds:
fixedn.append(n)
args[n] = kwds.pop(key)
else:
x0.append(args[n])
fitfun = self._fitfun
if len(fixedn) == 0:
func = fitfun
restore = None
else:
9 years ago
if len(fixedn) == nargs:
raise ValueError("All parameters fixed. " +
"There is nothing to optimize.")
def restore(args, theta):
# Replace with theta for all numbers not in fixedn
# This allows the non-fixed values to vary, but
# we still call self.nnlf with all parameters.
i = 0
9 years ago
for n in range(nargs):
if n not in fixedn:
args[n] = theta[i]
i += 1
return args
def func(theta, x):
newtheta = restore(args[:], theta)
return fitfun(newtheta, x)
return x0, func, restore, args, fixedn
@staticmethod
def _hessian(nnlf, theta, data, eps=None):
''' approximate hessian of nnlf where theta are the parameters
(including loc and scale)
'''
if eps is None:
9 years ago
eps = (_EPS) ** 0.25
num_par = len(theta)
# pab 07.01.2001: Always choose the stepsize h so that
# it is an exactly representable number.
# This is important when calculating numerical derivatives and is
# accomplished by the following.
delta = (eps + 2.0) - 2.0
delta2 = delta ** 2.0
# Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with
# 1/(d^2 L(theta|x)/dtheta^2)
# using central differences
LL = nnlf(theta, data)
H = zeros((num_par, num_par)) # Hessian matrix
theta = tuple(theta)
for ix in range(num_par):
sparam = list(theta)
sparam[ix] = theta[ix] + delta
fp = nnlf(sparam, data)
sparam[ix] = theta[ix] - delta
fm = nnlf(sparam, data)
H[ix, ix] = (fp - 2 * LL + fm) / delta2
for iy in range(ix + 1, num_par):
sparam[ix] = theta[ix] + delta
sparam[iy] = theta[iy] + delta
fpp = nnlf(sparam, data)
sparam[iy] = theta[iy] - delta
fpm = nnlf(sparam, data)
sparam[ix] = theta[ix] - delta
fmm = nnlf(sparam, data)
sparam[iy] = theta[iy] + delta
fmp = nnlf(sparam, data)
H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2)
H[iy, ix] = H[ix, iy]
sparam[iy] = theta[iy]
return -H
def _nnlf(self, theta, x):
9 years ago
return self.dist._penalized_nnlf(theta, x, self._penalty)
def _nlogps(self, theta, x):
""" Moran's negative log Product Spacings statistic
where theta are the parameters (including loc and scale)
Note the data in x must be sorted
References
-----------
R. C. H. Cheng; N. A. K. Amin (1983)
"Estimating Parameters in Continuous Univariate Distributions with a
Shifted Origin.",
Journal of the Royal Statistical Society. Series B (Methodological),
Vol. 45, No. 3. (1983), pp. 394-403.
R. C. H. Cheng; M. A. Stephens (1989)
"A Goodness-Of-Fit Test Using Moran's Statistic with Estimated
Parameters", Biometrika, 76, 2, pp 385-392
Wong, T.S.T. and Li, W.K. (2006)
"A note on the estimation of extreme value distributions using maximum
product of spacings.",
IMS Lecture Notes Monograph Series 2006, Vol. 52, pp. 272-283
"""
n = 2 if isinstance(self.dist, rv_continuous) else 1
try:
loc = theta[-n]
scale = theta[-1]
args = tuple(theta[:-n])
except IndexError:
raise ValueError("Not enough input arguments.")
if not isinstance(self.dist, rv_continuous):
scale = 1
if not self.dist._argcheck(*args) or scale <= 0:
return np.inf
dist = self.dist
x = asarray((x - loc) / scale)
9 years ago
cond0 = (x < dist.a) | (dist.b < x)
Nbad = np.sum(cond0)
if Nbad > 0:
x = argsreduce(~cond0, x)[0]
lowertail = True
if lowertail:
prb = np.hstack((0.0, dist.cdf(x, *args), 1.0))
dprb = np.diff(prb)
else:
prb = np.hstack((1.0, dist.sf(x, *args), 0.0))
dprb = -np.diff(prb)
logD = log(dprb + _XMIN)
dx = np.diff(x, axis=0)
tie = (dx == 0)
if np.any(tie):
# TODO : implement this method for treating ties in data:
# Assume measuring error is delta. Then compute
9 years ago
# yL = F(xi - delta, theta)
# yU = F(xi + delta, theta)
# and replace
# logDj = log((yU-yL)/(r-1)) for j = i+1,i+2,...i+r-1
# The following is OK when only minimization of T is wanted
i_tie, = np.nonzero(tie)
tiedata = x[i_tie]
logD[i_tie + 1] = log(dist._pdf(tiedata, *args)) - log(scale)
finiteD = np.isfinite(logD)
nonfiniteD = 1 - finiteD
Nbad += np.sum(nonfiniteD, axis=0)
if Nbad > 0:
9 years ago
if self._penalty is None:
penalty = 100.0 * log(_XMAX) * Nbad
else:
penalty = 0.0
return -np.sum(logD[finiteD], axis=0) + penalty
return -np.sum(logD, axis=0)
9 years ago
@staticmethod
def _get_optimizer(kwds):
optimizer = kwds.pop('optimizer', optimize.fmin)
# convert string to function in scipy.optimize
if not callable(optimizer) and isinstance(optimizer, string_types):
if not optimizer.startswith('fmin_'):
optimizer = '_'.join(("fmin_", optimizer))
try:
optimizer = getattr(optimize, optimizer)
except AttributeError:
raise ValueError("%s is not a valid optimizer" % optimizer)
return optimizer
def _fit_start(self, data, args, kwds):
dist = self.dist
9 years ago
narg = len(args)
if narg > dist.numargs + 2:
raise ValueError("Too many input arguments.")
9 years ago
if (narg < dist.numargs + 2) or not ('loc' in kwds and
'scale' in kwds):
# get distribution specific starting locations
start = dist._fitstart(data)
9 years ago
args += start[narg:]
9 years ago
loc = kwds.pop('loc', args[-2])
scale = kwds.pop('scale', args[-1])
args = args[:-2] + (loc, scale)
return args
@staticmethod
def _warn_if_no_success(warnflag):
if warnflag == 1:
warnings.warn("The maximum number of iterations was exceeded.")
elif warnflag == 2:
warnings.warn("Did not converge")
def _fit(self, *args, **kwds):
self._penalty = None
data = self.data
args = self._fit_start(data, args, kwds)
x0, func, restore, args, fixedn = self._reduce_func(args, kwds)
if self.search:
optimizer = self._get_optimizer(kwds)
# by now kwds must be empty, since everybody took what they needed
if kwds:
raise TypeError("Unknown arguments: %s." % kwds)
output = optimizer(func, x0, args=(data,), full_output=True,
disp=0)
9 years ago
if output[-1] != 0:
output = optimizer(func, output[0], args=(data,),
full_output=True)
9 years ago
vals = tuple(output[0])
self._warn_if_no_success(output[-1])
else:
vals = tuple(x0)
if restore is not None:
vals = restore(args, vals)
return vals, fixedn
def _compute_cov(self):
'''Compute covariance
'''
numpar = self.dist.numargs + 2
par_cov = zeros((numpar, numpar))
9 years ago
# self._penalty = 0
somefixed = ((self.par_fix is not None) and
np.any(isfinite(self.par_fix)))
9 years ago
H = np.asmatrix(self._hessian(self._fitfun, self.par, self.data))
9 years ago
# H = -nd.Hessian(lambda par: self._fitfun(par, self.data),
# method='forward')(self.par)
self.H = H
try:
if somefixed:
allfixed = np.all(isfinite(self.par_fix))
if allfixed:
self.par_cov[:, :] = 0
else:
pcov = -pinv2(H[self.i_notfixed, :][..., self.i_notfixed])
for row, ix in enumerate(list(self.i_notfixed)):
par_cov[ix, self.i_notfixed] = pcov[row, :]
else:
par_cov = -pinv2(H)
except:
par_cov[:, :] = nan
return par_cov
15 years ago
def fitfun(self, phat):
return self._fitfun(phat, self.data)
15 years ago
def profile(self, **kwds):
'''
Profile Log- likelihood or Log Product Spacing- function for phat[i]
Examples
--------
# MLE
>>> import wafo.stats as ws
>>> R = ws.weibull_min.rvs(1,size=100);
>>> phat = FitDistribution(ws.weibull_min, R, 1, scale=1, floc=0.0)
# Better CI for phat.par[i=0]
>>> Lp = phat.profile(i=0)
>>> Lp.plot()
>>> phat_ci = Lp.get_bounds(alpha=0.1)
See also
--------
Profile
'''
return Profile(self, **kwds)
def profile_quantile(self, x, **kwds):
'''
Profile Log- likelihood or Product Spacing-function for quantile.
Examples
--------
# MLE
>>> import wafo.stats as ws
>>> R = ws.weibull_min.rvs(1,size=100);
>>> phat = FitDistribution(ws.weibull_min, R, 1, scale=1, floc=0.0)
>>> sf = 1./990
>>> x = phat.isf(sf)
# 80% CI for x
9 years ago
>>> profile_x = phat.profile_quantile(x)
>>> profile_x.plot()
>>> x_ci = profile_x.get_bounds(alpha=0.2)
'''
return ProfileQuantile(self, x, **kwds)
def profile_probability(self, log_sf, **kwds):
'''
Profile Log- likelihood or Product Spacing-function for probability.
Examples
--------
# MLE
>>> import wafo.stats as ws
>>> R = ws.weibull_min.rvs(1,size=100);
>>> phat = FitDistribution(ws.weibull_min, R, 1, scale=1, floc=0.0)
>>> log_sf = np.log(1./990)
# 80% CI for log_sf
9 years ago
>>> profile_logsf = phat.profile_probability(log_sf)
>>> profile_logsf.plot()
>>> log_sf_ci = profile_logsf.get_bounds(alpha=0.2)
'''
return ProfileProbability(self, log_sf, **kwds)
def ci_sf(self, sf, alpha=0.05, i=2):
ci = []
for log_sfi in np.atleast_1d(np.log(sf)).ravel():
try:
Lp = self.profile_probability(log_sfi, i=i)
ci.append(np.exp(Lp.get_bounds(alpha=alpha)))
except Exception:
ci.append((np.nan, np.nan))
return np.array(ci)
def ci_quantile(self, x, alpha=0.05, i=2):
ci = []
for xi in np.atleast_1d(x).ravel():
try:
Lx = self.profile_quantile(xi, i=2)
ci.append(Lx.get_bounds(alpha=alpha))
except Exception:
ci.append((np.nan, np.nan))
return np.array(ci)
def plotfitsummary(self, axes=None, fig=None):
''' Plot various diagnostic plots to asses the quality of the fit.
PLOTFITSUMMARY displays probability plot, density plot, residual
quantile plot and residual probability plot.
The purpose of these plots is to graphically assess whether the data
could come from the fitted distribution. If so the empirical- CDF and
PDF should follow the model and the residual plots will be linear.
Other distribution types will introduce curvature in the residual
plots.
'''
if axes is None:
fig, axes = plt.subplots(2, 2, figsize=(11, 8))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
# plt.subplots_adjust(hspace=0.4, wspace=0.4)
# self.plotecdf()
plot_funs = (self.plotesf, self.plotepdf,
self.plotresq, self.plotresprb)
for axis, plot in zip(axes.ravel(), plot_funs):
plot(axis=axis)
if fig is None:
fig = plt.gcf()
fixstr = ''
if self.par_fix is not None:
numfix = len(self.i_fixed)
15 years ago
if numfix > 0:
format0 = ', '.join(['%d'] * numfix)
format1 = ', '.join(['%g'] * numfix)
12 years ago
phatistr = format0 % tuple(self.i_fixed)
phatvstr = format1 % tuple(self.par[self.i_fixed])
9 years ago
fixstr = 'Fixed: phat[{0:s}] = {1:s} '.format(phatistr,
phatvstr)
subtxt = 'Fit method: {0:s}, Fit p-value: {1:2.2f} {2:s}, phat=[{3:s}]'
9 years ago
par_txt = ('{:1.2g}, '*len(self.par))[:-2].format(*self.par)
try:
fig.text(0.05, 0.01, subtxt.format(self.method.upper(),
self.pvalue, fixstr, par_txt))
except Exception:
pass
def plotesf(self, symb1='r-', symb2='b.', axis=None, plot_ci=False):
''' Plot Empirical and fitted Survival Function
The purpose of the plot is to graphically assess whether
the data could come from the fitted distribution.
If so the empirical CDF should resemble the model CDF.
Other distribution types will introduce deviations in the plot.
'''
if axis is None:
axis = plt.gca()
n = len(self.data)
9 years ago
sf = (arange(n, 0, -1)) / n
axis.semilogy(self.data, sf, symb2,
self.data, self.sf(self.data), symb1)
if plot_ci:
low = int(np.log10(1.0/n)-0.7) - 1
sf1 = np.logspace(low, -0.5, 7)[::-1]
ci1 = self.ci_sf(sf, alpha=0.05, i=2)
axis.semilogy(self.isf(sf1), ci1, 'r--')
axis.set_xlabel('x')
axis.set_ylabel('F(x) (%s)' % self.dist.name)
axis.set_title('Empirical SF plot')
def plotecdf(self, symb1='r-', symb2='b.', axis=None):
''' Plot Empirical and fitted Cumulative Distribution Function
The purpose of the plot is to graphically assess whether
the data could come from the fitted distribution.
If so the empirical CDF should resemble the model CDF.
Other distribution types will introduce deviations in the plot.
'''
if axis is None:
axis = plt.gca()
n = len(self.data)
15 years ago
F = (arange(1, n + 1)) / n
axis.plot(self.data, F, symb2,
self.data, self.cdf(self.data), symb1)
axis.set_xlabel('x')
axis.set_ylabel('F(x) ({})'.format(self.dist.name))
axis.set_title('Empirical CDF plot')
def _get_grid(self, odd=False):
x = np.atleast_1d(self.data)
n = np.ceil(4 * np.sqrt(np.sqrt(len(x))))
mn = x.min()
mx = x.max()
d = (mx - mn) / n * 2
e = np.floor(np.log(d) / np.log(10))
m = np.floor(d / 10 ** e)
if m > 5:
m = 5
elif m > 2:
m = 2
d = m * 10 ** e
mn = (np.floor(mn / d) - 1) * d - odd * d / 2
mx = (np.ceil(mx / d) + 1) * d + odd * d / 2
limits = np.arange(mn, mx, d)
return limits
@staticmethod
def _staircase(x, y):
xx = x.reshape(-1, 1).repeat(3, axis=1).ravel()[1:-1]
yy = y.reshape(-1, 1).repeat(3, axis=1)
# yy[0,0] = 0.0 # pdf
yy[:, 0] = 0.0 # histogram
yy.shape = (-1,)
yy = np.hstack((yy, 0.0))
return xx, yy
def _get_empirical_pdf(self):
limits = self._get_grid()
pdf, x = np.histogram(self.data, bins=limits, normed=True)
return self._staircase(x, pdf)
def plotepdf(self, symb1='r-', symb2='b-', axis=None):
'''Plot Empirical and fitted Probability Density Function
The purpose of the plot is to graphically assess whether
the data could come from the fitted distribution.
If so the histogram should resemble the model density.
Other distribution types will introduce deviations in the plot.
'''
if axis is None:
axis = plt.gca()
x, pdf = self._get_empirical_pdf()
ymax = pdf.max()
# axis.hist(self.data,normed=True,fill=False)
axis.plot(self.data, self.pdf(self.data), symb1,
x, pdf, symb2)
axis1 = list(axis.axis())
axis1[3] = min(ymax * 1.3, axis1[3])
axis.axis(axis1)
axis.set_xlabel('x')
axis.set_ylabel('f(x) (%s)' % self.dist.name)
axis.set_title('Density plot')
def plotresq(self, symb1='r-', symb2='b.', axis=None):
'''PLOTRESQ displays a residual quantile plot.
The purpose of the plot is to graphically assess whether
the data could come from the fitted distribution. If so the
plot will be linear. Other distribution types will introduce
curvature in the plot.
'''
if axis is None:
axis = plt.gca()
15 years ago
n = len(self.data)
eprob = (arange(1, n + 1) - 0.5) / n
y = self.ppf(eprob)
15 years ago
y1 = self.data[[0, -1]]
axis.plot(self.data, y, symb2, y1, y1, symb1)
axis.set_xlabel('Empirical')
axis.set_ylabel('Model (%s)' % self.dist.name)
axis.set_title('Residual Quantile Plot')
axis.axis('tight')
axis.axis('equal')
def plotresprb(self, symb1='r-', symb2='b.', axis=None):
''' PLOTRESPRB displays a residual probability plot.
The purpose of the plot is to graphically assess whether
the data could come from the fitted distribution. If so the
plot will be linear. Other distribution types will introduce curvature
in the plot.
'''
if axis is None:
axis = plt.gca()
n = len(self.data)
# ecdf = (0.5:n-0.5)/n;
15 years ago
ecdf = arange(1, n + 1) / (n + 1)
mcdf = self.cdf(self.data)
15 years ago
p1 = [0, 1]
axis.plot(ecdf, mcdf, symb2, p1, p1, symb1)
axis.set_xlabel('Empirical')
axis.set_ylabel('Model (%s)' % self.dist.name)
axis.set_title('Residual Probability Plot')
axis.axis('equal')
axis.axis([0, 1, 0, 1])
def _pvalue(self, theta, x, unknown_numpar=None):
''' Return P-value for the fit using Moran's negative log Product
Spacings statistic
where theta are the parameters (including loc and scale)
Note: the data in x must be sorted
'''
dx = np.diff(x, axis=0)
15 years ago
tie = (dx == 0)
if np.any(tie):
warnings.warn(
'P-value is on the conservative side (i.e. too large) due to' +
' ties in the data!')
T = self.dist.nlogps(theta, x)
n = len(x)
15 years ago
np1 = n + 1
if unknown_numpar is None:
k = len(theta)
else:
k = unknown_numpar
isParUnKnown = True
15 years ago
m = (np1) * (log(np1) + 0.57722) - 0.5 - 1.0 / (12. * (np1))
v = (np1) * (pi ** 2. / 6.0 - 1.0) - 0.5 - 1.0 / (6. * (np1))
C1 = m - sqrt(0.5 * n * v)
C2 = sqrt(v / (2.0 * n))
# chi2 with n degrees of freedom
Tn = (T + 0.5 * k * isParUnKnown - C1) / C2
pvalue = chi2sf(Tn, n) # _WAFODIST.chi2.sf(Tn, n)
return pvalue
12 years ago
def test_doctstrings():
import doctest
doctest.testmod()
def test1():
import wafo.stats as ws
dist = ws.weibull_min
# dist = ws.bradford
# dist = ws.gengamma
R = dist.rvs(2, .5, size=500)
phat = FitDistribution(dist, R, floc=0.5, method='ml')
phats = FitDistribution(dist, R, floc=0.5, method='mps')
# import matplotlib.pyplot as plt
# plt.figure(0)
# plot_all_profiles(phat, plot=plt)
9 years ago
plt.figure(1)
phats.plotfitsummary()
# plt.figure(2)
# plot_all_profiles(phat, plot=plt)
# plt.figure(3)
# phat.plotfitsummary()
plt.figure(4)
sf = 1./990
x = phat.isf(sf)
# 80% CI for x
9 years ago
profile_x = ProfileQuantile(phat, x)
profile_x.plot()
# x_ci = profile_x.get_bounds(alpha=0.2)
plt.figure(5)
sf = 1./990
x = phat.isf(sf)
# 80% CI for x
9 years ago
profile_logsf = ProfileProbability(phat, np.log(sf))
profile_logsf.plot()
# logsf_ci = profile_logsf.get_bounds(alpha=0.2)
9 years ago
plt.show('hold')
15 years ago
if __name__ == '__main__':
test1()
# test_doctstrings()