You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
509 lines
17 KiB
Python
509 lines
17 KiB
Python
from __future__ import absolute_import
|
|
from scipy.stats._distn_infrastructure import * # @UnusedWildImport
|
|
from scipy.stats._distn_infrastructure import (_skew, # @UnusedImport
|
|
_kurtosis, _ncx2_log_pdf, # @IgnorePep8 @UnusedImport
|
|
_ncx2_pdf, _ncx2_cdf) # @UnusedImport @IgnorePep8
|
|
from .estimation import FitDistribution, rv_frozen # @Reimport
|
|
from ._constants import _XMAX, _XMIN
|
|
from wafo.misc import lazyselect as _lazyselect # @UnusedImport
|
|
from wafo.misc import lazywhere as _lazywhere # @UnusedImport
|
|
|
|
_doc_default_example = """\
|
|
Examples
|
|
--------
|
|
>>> from wafo.stats import %(name)s
|
|
>>> import matplotlib.pyplot as plt
|
|
>>> fig, ax = plt.subplots(1, 1)
|
|
|
|
Calculate a few first moments:
|
|
|
|
%(set_vals_stmt)s
|
|
>>> mean, var, skew, kurt = %(name)s.stats(%(shapes)s, moments='mvsk')
|
|
|
|
Display the probability density function (``pdf``):
|
|
|
|
>>> x = np.linspace(%(name)s.ppf(0.01, %(shapes)s),
|
|
... %(name)s.ppf(0.99, %(shapes)s), 100)
|
|
>>> ax.plot(x, %(name)s.pdf(x, %(shapes)s),
|
|
... 'r-', lw=5, alpha=0.6, label='%(name)s pdf')
|
|
|
|
Alternatively, the distribution object can be called (as a function)
|
|
to fix the shape, location and scale parameters. This returns a "frozen"
|
|
RV object holding the given parameters fixed.
|
|
|
|
Freeze the distribution and display the frozen ``pdf``:
|
|
|
|
>>> rv = %(name)s(%(shapes)s)
|
|
>>> ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')
|
|
|
|
Check accuracy of ``cdf`` and ``ppf``:
|
|
|
|
>>> vals = %(name)s.ppf([0.001, 0.5, 0.999], %(shapes)s)
|
|
>>> np.allclose([0.001, 0.5, 0.999], %(name)s.cdf(vals, %(shapes)s))
|
|
True
|
|
|
|
Generate random numbers:
|
|
|
|
>>> r = %(name)s.rvs(%(shapes)s, size=1000)
|
|
|
|
And compare the histogram:
|
|
|
|
>>> ax.hist(r, normed=True, histtype='stepfilled', alpha=0.2)
|
|
>>> ax.legend(loc='best', frameon=False)
|
|
>>> plt.show()
|
|
|
|
Compare ML and MPS method
|
|
>>> phat = %(name)s.fit2(R, method='ml');>>> phat.plotfitsummary()
|
|
>>> plt.figure(plt.gcf().number+1)
|
|
>>> phat2 = %(name)s.fit2(R, method='mps')
|
|
>>> phat2.plotfitsummary(); plt.figure(plt.gcf().number+1)
|
|
|
|
Fix loc=0 and estimate shapes and scale
|
|
>>> phat3 = %(name)s.fit2(R, scale=1, floc=0, method='mps')
|
|
>>> phat3.plotfitsummary(); plt.figure(plt.gcf().number+1)
|
|
|
|
Accurate confidence interval with profile loglikelihood
|
|
>>> lp = phat3.profile()
|
|
>>> lp.plot()
|
|
>>> pci = lp.get_bounds()
|
|
|
|
"""
|
|
|
|
|
|
def freeze(self, *args, **kwds):
|
|
"""Freeze the distribution for the given arguments.
|
|
|
|
Parameters
|
|
----------
|
|
arg1, arg2, arg3,... : array_like
|
|
The shape parameter(s) for the distribution. Should include all
|
|
the non-optional arguments, may include ``loc`` and ``scale``.
|
|
|
|
Returns
|
|
-------
|
|
rv_frozen : rv_frozen instance
|
|
The frozen distribution.
|
|
|
|
"""
|
|
return rv_frozen(self, *args, **kwds)
|
|
|
|
|
|
def _resolve_ties(self, log_dprb, x, args, scale):
|
|
dx = np.diff(x, axis=0)
|
|
tie = dx == 0
|
|
if any(tie): # TODO : implement this method for treating ties in data:
|
|
# Assume measuring error is delta. Then compute
|
|
# yL = F(xi-delta,theta)
|
|
# yU = F(xi+delta,theta)
|
|
# and replace
|
|
# logDj = log((yU-yL)/(r-1)) for j = i+1,i+2,...i+r-1
|
|
# The following is OK when only minimization of T is wanted
|
|
i_tie, = np.nonzero(tie)
|
|
log_dprb[i_tie + 1] = log(self._pdf(x[i_tie], *args)) - log(scale)
|
|
return log_dprb
|
|
|
|
|
|
def _log_dprb(self, x, args, scale, lowertail=True):
|
|
|
|
if lowertail:
|
|
prb = np.hstack((0.0, self.cdf(x, *args), 1.0))
|
|
dprb = np.diff(prb)
|
|
else:
|
|
prb = np.hstack((1.0, self.sf(x, *args), 0.0))
|
|
dprb = -np.diff(prb)
|
|
log_dprb = log(dprb + _XMIN)
|
|
log_dprb = _resolve_ties(self, log_dprb, x, args, scale)
|
|
return log_dprb
|
|
|
|
|
|
def _nlogps_and_penalty(self, x, scale, args):
|
|
cond0 = ~self._support_mask(x)
|
|
n_bad = np.sum(cond0)
|
|
if n_bad > 0:
|
|
x = argsreduce(~cond0, x)[0]
|
|
log_dprb = _log_dprb(self, x, args, scale)
|
|
finite_log_dprb = np.isfinite(log_dprb)
|
|
n_bad += np.sum(~finite_log_dprb, axis=0)
|
|
if n_bad > 0:
|
|
penalty = 100.0 * np.log(_XMAX) * n_bad
|
|
return -np.sum(log_dprb[finite_log_dprb], axis=0) + penalty
|
|
return -np.sum(log_dprb, axis=0)
|
|
|
|
|
|
def _penalized_nlogps(self, theta, x):
|
|
""" Moran's negative log Product Spacings statistic
|
|
|
|
where theta are the parameters (including loc and scale)
|
|
|
|
Note the data in x must be sorted
|
|
|
|
References
|
|
-----------
|
|
|
|
R. C. H. Cheng; N. A. K. Amin (1983)
|
|
"Estimating Parameters in Continuous Univariate Distributions with a
|
|
Shifted Origin.",
|
|
Journal of the Royal Statistical Society. Series B (Methodological),
|
|
Vol. 45, No. 3. (1983), pp. 394-403.
|
|
|
|
R. C. H. Cheng; M. A. Stephens (1989)
|
|
"A Goodness-Of-Fit Test Using Moran's Statistic with Estimated
|
|
Parameters", Biometrika, 76, 2, pp 385-392
|
|
|
|
Wong, T.S.T. and Li, W.K. (2006)
|
|
"A note on the estimation of extreme value distributions using maximum
|
|
product of spacings.",
|
|
IMS Lecture Notes Monograph Series 2006, Vol. 52, pp. 272-283
|
|
"""
|
|
loc, scale, args = _unpack_loc_scale(theta)
|
|
if not self._argcheck(*args) or scale <= 0:
|
|
return inf
|
|
x = asarray((x - loc) / scale)
|
|
return _nlogps_and_penalty(self, x, scale, args)
|
|
|
|
|
|
def _unpack_loc_scale(theta):
|
|
try:
|
|
loc = theta[-2]
|
|
scale = theta[-1]
|
|
args = tuple(theta[:-2])
|
|
except IndexError:
|
|
raise ValueError("Not enough input arguments.")
|
|
return loc, scale, args
|
|
|
|
|
|
def _nnlf_and_penalty(self, x, args):
|
|
cond0 = ~self._support_mask(x)
|
|
n_bad = sum(cond0)
|
|
if n_bad > 0:
|
|
x = argsreduce(~cond0, x)[0]
|
|
logpdf = self._logpdf(x, *args)
|
|
finite_logpdf = np.isfinite(logpdf)
|
|
n_bad += np.sum(~finite_logpdf, axis=0)
|
|
if n_bad > 0:
|
|
penalty = n_bad * log(_XMAX) * 100
|
|
return -np.sum(logpdf[finite_logpdf], axis=0) + penalty
|
|
return -np.sum(logpdf, axis=0)
|
|
|
|
|
|
def _penalized_nnlf(self, theta, x):
|
|
''' Return negative loglikelihood function,
|
|
i.e., - sum (log pdf(x, theta), axis=0)
|
|
where theta are the parameters (including loc and scale)
|
|
'''
|
|
loc, scale, args = _unpack_loc_scale(theta)
|
|
if not self._argcheck(*args) or scale <= 0:
|
|
return inf
|
|
x = asarray((x-loc) / scale)
|
|
n_log_scale = len(x) * log(scale)
|
|
return _nnlf_and_penalty(self, x, args) + n_log_scale
|
|
|
|
|
|
def _convert_fshapes2num(self, kwds):
|
|
# First of all, convert fshapes params to fnum: eg for stats.beta,
|
|
# shapes='a, b'. To fix `a`, can specify either `f1` or `fa`.
|
|
# Convert the latter into the former.
|
|
if self.shapes:
|
|
shapes = self.shapes.replace(',', ' ').split()
|
|
for j, s in enumerate(shapes):
|
|
val = kwds.pop('f' + s, None) or kwds.pop('fix_' + s, None)
|
|
if val is not None:
|
|
key = 'f{0:d}'.format(j)
|
|
if key in kwds:
|
|
raise ValueError("Duplicate entry for {0:s}.".format(key))
|
|
else:
|
|
kwds[key] = val
|
|
return kwds
|
|
|
|
|
|
def _unpack_args_kwds(self, args, kwds):
|
|
kwds = _convert_fshapes2num(self, kwds)
|
|
args = list(args)
|
|
fixedn = []
|
|
names = ['f%d' % n for n in range(len(args) - 2)] + ['floc', 'fscale']
|
|
x0 = []
|
|
for n, key in enumerate(names):
|
|
if key in kwds:
|
|
fixedn.append(n)
|
|
args[n] = kwds.pop(key)
|
|
else:
|
|
x0.append(args[n])
|
|
return x0, args, fixedn
|
|
|
|
|
|
def _reduce_func(self, args, kwds):
|
|
method = kwds.pop('method', 'ml').lower()
|
|
if method.startswith('mps'):
|
|
fitfun = self._penalized_nlogps
|
|
else:
|
|
fitfun = self._penalized_nnlf
|
|
|
|
x0, args, fixedn = _unpack_args_kwds(self, args, kwds)
|
|
|
|
nargs = len(args)
|
|
|
|
if len(fixedn) == 0:
|
|
func = fitfun
|
|
restore = None
|
|
else:
|
|
if len(fixedn) == nargs:
|
|
raise ValueError(
|
|
"All parameters fixed. There is nothing to optimize.")
|
|
|
|
def restore(args, theta):
|
|
# Replace with theta for all numbers not in fixedn
|
|
# This allows the non-fixed values to vary, but
|
|
# we still call self.nnlf with all parameters.
|
|
i = 0
|
|
for n in range(nargs):
|
|
if n not in fixedn:
|
|
args[n] = theta[i]
|
|
i += 1
|
|
return args
|
|
|
|
def func(theta, x):
|
|
newtheta = restore(args[:], theta)
|
|
return fitfun(newtheta, x)
|
|
|
|
return x0, func, restore, args, fixedn
|
|
|
|
|
|
def _get_optimizer(kwds):
|
|
optimizer = kwds.pop('optimizer', optimize.fmin)
|
|
# convert string to function in scipy.optimize
|
|
if not callable(optimizer) and isinstance(optimizer, string_types):
|
|
if not optimizer.startswith('fmin_'):
|
|
optimizer = '_'.join(("fmin", optimizer))
|
|
try:
|
|
optimizer = getattr(optimize, optimizer)
|
|
except AttributeError:
|
|
raise ValueError("{} is not a valid optimizer".format(optimizer))
|
|
return optimizer
|
|
|
|
|
|
def _warn_if_no_success(warnflag):
|
|
if warnflag == 1:
|
|
warnings.warn("The maximum number of iterations was exceeded.")
|
|
elif warnflag == 2:
|
|
warnings.warn("Did not converge")
|
|
|
|
|
|
def _fitstart(self, data, args, kwds):
|
|
narg = len(args)
|
|
if narg > self.numargs:
|
|
raise TypeError("Too many input arguments.")
|
|
start = [None] * 2
|
|
if (narg < self.numargs) or not ('loc' in kwds and 'scale' in kwds):
|
|
# get distribution specific starting locations
|
|
start = self._fitstart(data)
|
|
args += start[narg:-2]
|
|
loc = kwds.pop('loc', start[-2])
|
|
scale = kwds.pop('scale', start[-1])
|
|
args += loc, scale
|
|
return args, kwds
|
|
|
|
|
|
def fit(self, data, *args, **kwargs):
|
|
"""
|
|
Return ML/MPS estimate for shape, location, and scale parameters from data.
|
|
|
|
ML and MPS stands for Maximum Likelihood and Maximum Product Spacing,
|
|
respectively. Starting estimates for
|
|
the fit are given by input arguments; for any arguments not provided
|
|
with starting estimates, ``self._fitstart(data)`` is called to generate
|
|
such.
|
|
|
|
One can hold some parameters fixed to specific values by passing in
|
|
keyword arguments ``f0``, ``f1``, ..., ``fn`` (for shape parameters)
|
|
and ``floc`` and ``fscale`` (for location and scale parameters,
|
|
respectively).
|
|
|
|
Parameters
|
|
----------
|
|
data : array_like
|
|
Data to use in calculating the MLEs.
|
|
args : floats, optional
|
|
Starting value(s) for any shape-characterizing arguments (those not
|
|
provided will be determined by a call to ``_fitstart(data)``).
|
|
No default value.
|
|
kwds : floats, optional
|
|
Starting values for the location and scale parameters; no default.
|
|
Special keyword arguments are recognized as holding certain
|
|
parameters fixed:
|
|
|
|
- f0...fn : hold respective shape parameters fixed.
|
|
Alternatively, shape parameters to fix can be specified by name.
|
|
For example, if ``self.shapes == "a, b"``, ``fa``and ``fix_a``
|
|
are equivalent to ``f0``, and ``fb`` and ``fix_b`` are
|
|
equivalent to ``f1``.
|
|
|
|
- floc : hold location parameter fixed to specified value.
|
|
|
|
- fscale : hold scale parameter fixed to specified value.
|
|
|
|
- method : of estimation. Options are
|
|
'ml' : Maximum Likelihood method (default)
|
|
'mps': Maximum Product Spacing method
|
|
|
|
- alpha : Confidence coefficent (default=0.05)
|
|
|
|
- search : bool
|
|
If true search for best estimator (default),
|
|
otherwise return object with initial distribution parameters
|
|
|
|
- optimizer : The optimizer to use. The optimizer must take ``func``,
|
|
and starting position as the first two arguments,
|
|
plus ``args`` (for extra arguments to pass to the
|
|
function to be optimized) and ``disp=0`` to suppress
|
|
output as keyword arguments.
|
|
|
|
Returns
|
|
-------
|
|
shape, loc, scale : tuple of floats
|
|
MLEs for any shape statistics, followed by those for location and
|
|
scale.
|
|
|
|
Notes
|
|
-----
|
|
This fit is computed by maximizing a log-likelihood function, with
|
|
penalty applied for samples outside of range of the distribution. The
|
|
returned answer is not guaranteed to be the globally optimal MLE, it
|
|
may only be locally optimal, or the optimization may fail altogether.
|
|
|
|
|
|
Examples
|
|
--------
|
|
|
|
Generate some data to fit: draw random variates from the `beta`
|
|
distribution
|
|
|
|
>>> from wafo.stats import beta
|
|
>>> a, b = 1., 2.
|
|
>>> x = beta.rvs(a, b, size=1000)
|
|
|
|
Now we can fit all four parameters (``a``, ``b``, ``loc`` and ``scale``):
|
|
|
|
>>> a1, b1, loc1, scale1 = beta.fit(x)
|
|
|
|
We can also use some prior knowledge about the dataset: let's keep
|
|
``loc`` and ``scale`` fixed:
|
|
|
|
>>> a1, b1, loc1, scale1 = beta.fit(x, floc=0, fscale=1)
|
|
>>> loc1, scale1
|
|
(0, 1)
|
|
|
|
We can also keep shape parameters fixed by using ``f``-keywords. To
|
|
keep the zero-th shape parameter ``a`` equal 1, use ``f0=1`` or,
|
|
equivalently, ``fa=1``:
|
|
|
|
>>> a1, b1, loc1, scale1 = beta.fit(x, fa=1, floc=0, fscale=1)
|
|
>>> a1
|
|
1
|
|
|
|
"""
|
|
vals, _ = self._fit(data, *args, **kwargs)
|
|
return vals
|
|
|
|
|
|
def _fit(self, data, *args, **kwargs):
|
|
args, kwds = _fitstart(self, data, args, kwargs.copy())
|
|
x0, func, restore, args, fixedn = self._reduce_func(args, kwds)
|
|
if kwds.pop('search', True):
|
|
optimizer = _get_optimizer(kwds)
|
|
|
|
# by now kwds must be empty, since everybody took what they needed
|
|
if kwds:
|
|
raise TypeError("Unknown arguments: {}.".format(kwds))
|
|
|
|
output = optimizer(func, x0, args=(ravel(data),), full_output=True,
|
|
disp=0)
|
|
if output[-1] != 0:
|
|
output = optimizer(func, output[0], args=(ravel(data),),
|
|
full_output=True)
|
|
|
|
_warn_if_no_success(output[-1])
|
|
vals = tuple(output[0])
|
|
else:
|
|
vals = tuple(x0)
|
|
|
|
if restore is not None:
|
|
vals = restore(args, vals)
|
|
vals = tuple(vals)
|
|
return vals, fixedn
|
|
|
|
|
|
def fit2(self, data, *args, **kwds):
|
|
''' Return Maximum Likelihood or Maximum Product Spacing estimator object
|
|
|
|
Parameters
|
|
----------
|
|
data : array-like
|
|
Data to use in calculating the ML or MPS estimators
|
|
args : optional
|
|
Starting values for any shape arguments (those not specified
|
|
will be determined by dist._fitstart(data))
|
|
kwds : loc, scale
|
|
Starting values for the location and scale parameters
|
|
Special keyword arguments are recognized as holding certain
|
|
parameters fixed:
|
|
f0..fn : hold respective shape paramters fixed
|
|
floc : hold location parameter fixed to specified value
|
|
fscale : hold scale parameter fixed to specified value
|
|
method : of estimation. Options are
|
|
'ml' : Maximum Likelihood method (default)
|
|
'mps': Maximum Product Spacing method
|
|
alpha : scalar, optional
|
|
Confidence coefficent (default=0.05)
|
|
search : bool
|
|
If true search for best estimator (default),
|
|
otherwise return object with initial distribution parameters
|
|
copydata : bool
|
|
If true copydata (default)
|
|
optimizer : The optimizer to use. The optimizer must take func,
|
|
and starting position as the first two arguments,
|
|
plus args (for extra arguments to pass to the
|
|
function to be optimized) and disp=0 to suppress
|
|
output as keyword arguments.
|
|
|
|
Return
|
|
------
|
|
phat : FitDistribution object
|
|
Fitted distribution object with following member variables:
|
|
LLmax : loglikelihood function evaluated using par
|
|
LPSmax : log product spacing function evaluated using par
|
|
pvalue : p-value for the fit
|
|
par : distribution parameters (fixed and fitted)
|
|
par_cov : covariance of distribution parameters
|
|
par_fix : fixed distribution parameters
|
|
par_lower : lower (1-alpha)% confidence bound for the parameters
|
|
par_upper : upper (1-alpha)% confidence bound for the parameters
|
|
|
|
Note
|
|
----
|
|
`data` is sorted using this function, so if `copydata`==False the data
|
|
in your namespace will be sorted as well.
|
|
'''
|
|
return FitDistribution(self, data, args, **kwds)
|
|
|
|
|
|
def _support_mask(self, x):
|
|
return (self.a <= x) & (x <= self.b)
|
|
|
|
|
|
def _open_support_mask(self, x):
|
|
return (self.a < x) & (x < self.b)
|
|
|
|
|
|
rv_generic.freeze = freeze
|
|
rv_discrete.freeze = freeze
|
|
rv_continuous.freeze = freeze
|
|
|
|
rv_continuous._penalized_nlogps = _penalized_nlogps
|
|
rv_continuous._penalized_nnlf = _penalized_nnlf
|
|
rv_continuous._reduce_func = _reduce_func
|
|
rv_continuous.fit = fit
|
|
rv_continuous._fit = _fit
|
|
rv_continuous.fit2 = fit2
|
|
rv_continuous._support_mask = _support_mask
|
|
rv_continuous._open_support_mask = _open_support_mask
|