From e6ed2f4d3f885486960f09e830e9915d56b31dbb Mon Sep 17 00:00:00 2001 From: Per A Brodtkorb Date: Tue, 3 Jan 2017 01:14:22 +0100 Subject: [PATCH] Simplified things. --- wafo/stats/_distn_infrastructure.py | 178 +------------- wafo/stats/estimation.py | 350 +++++++++++++++++++--------- 2 files changed, 245 insertions(+), 283 deletions(-) diff --git a/wafo/stats/_distn_infrastructure.py b/wafo/stats/_distn_infrastructure.py index 67649a3..7f31b6c 100644 --- a/wafo/stats/_distn_infrastructure.py +++ b/wafo/stats/_distn_infrastructure.py @@ -3,10 +3,10 @@ from scipy.stats._distn_infrastructure import * # @UnusedWildImport from scipy.stats._distn_infrastructure import (_skew, # @UnusedImport _kurtosis, _ncx2_log_pdf, # @IgnorePep8 @UnusedImport _ncx2_pdf, _ncx2_cdf) # @UnusedImport @IgnorePep8 -from .estimation import FitDistribution +from .estimation import FitDistribution, rv_frozen # @Reimport from ._constants import _XMAX, _XMIN -from wafo.misc import lazyselect as _lazyselect -from wafo.misc import lazywhere as _lazywhere +from wafo.misc import lazyselect as _lazyselect # @UnusedImport +from wafo.misc import lazywhere as _lazywhere # @UnusedImport _doc_default_example = """\ Examples @@ -70,137 +70,6 @@ Accurate confidence interval with profile loglikelihood """ -# Frozen RV class -class rv_frozen(object): - ''' Frozen continous or discrete 1D Random Variable object (RV) - - Methods - ------- - rvs(size=1) - Random variates. - pdf(x) - Probability density function. - cdf(x) - Cumulative density function. - sf(x) - Survival function (1-cdf --- sometimes more accurate). - ppf(q) - Percent point function (inverse of cdf --- percentiles). - isf(q) - Inverse survival function (inverse of sf). - stats(moments='mv') - Mean('m'), variance('v'), skew('s'), and/or kurtosis('k'). - moment(n) - n-th order non-central moment of distribution. - entropy() - (Differential) entropy of the RV. - interval(alpha) - Confidence interval with equal areas around the median. - expect(func, lb, ub, conditional=False) - Calculate expected value of a function with respect to the - distribution. 
- ''' - def __init__(self, dist, *args, **kwds): - # create a new instance - self.dist = dist # .__class__(**dist._ctor_param) - shapes, loc, scale = self.dist._parse_args(*args, **kwds) - if isinstance(dist, rv_continuous): - self.par = shapes + (loc, scale) - else: # rv_discrete - self.par = shapes + (loc,) - self.a = self.dist.a - self.b = self.dist.b - self.shapes = self.dist.shapes - # @property - # def shapes(self): - # return self.dist.shapes - - @property - def random_state(self): - return self.dist._random_state - - @random_state.setter - def random_state(self, seed): - self.dist._random_state = check_random_state(seed) - - def pdf(self, x): - ''' Probability density function at x of the given RV.''' - return self.dist.pdf(x, *self.par) - - def logpdf(self, x): - return self.dist.logpdf(x, *self.par) - - def cdf(self, x): - '''Cumulative distribution function at x of the given RV.''' - return self.dist.cdf(x, *self.par) - - def logcdf(self, x): - return self.dist.logcdf(x, *self.par) - - def ppf(self, q): - '''Percent point function (inverse of cdf) at q of the given RV.''' - return self.dist.ppf(q, *self.par) - - def isf(self, q): - '''Inverse survival function at q of the given RV.''' - return self.dist.isf(q, *self.par) - - def rvs(self, size=None, random_state=None): - kwds = {'size': size, 'random_state': random_state} - return self.dist.rvs(*self.par, **kwds) - - def sf(self, x): - '''Survival function (1-cdf) at x of the given RV.''' - return self.dist.sf(x, *self.par) - - def logsf(self, x): - return self.dist.logsf(x, *self.par) - - def stats(self, moments='mv'): - ''' Some statistics of the given RV''' - kwds = dict(moments=moments) - return self.dist.stats(*self.par, **kwds) - - def median(self): - return self.dist.median(*self.par) - - def mean(self): - return self.dist.mean(*self.par) - - def var(self): - return self.dist.var(*self.par) - - def std(self): - return self.dist.std(*self.par) - - def moment(self, n): - return self.dist.moment(n, *self.par) - - def entropy(self): - return self.dist.entropy(*self.par) - - def pmf(self, k): - '''Probability mass function at k of the given RV''' - return self.dist.pmf(k, *self.par) - - def logpmf(self, k): - return self.dist.logpmf(k, *self.par) - - def interval(self, alpha): - return self.dist.interval(alpha, *self.par) - - def expect(self, func=None, lb=None, ub=None, conditional=False, **kwds): - if isinstance(self.dist, rv_continuous): - a, loc, scale = self.par[:-2], self.par[:-2], self.par[-1] - return self.dist.expect(func, a, loc, scale, lb, ub, conditional, - **kwds) - - a, loc = self.par[:-1], self.par[-1] - if kwds: - raise ValueError("Discrete expect does not accept **kwds.") - return self.dist.expect(func, a, loc, lb, ub, conditional) - - def freeze(self, *args, **kwds): """Freeze the distribution for the given arguments. @@ -219,41 +88,6 @@ def freeze(self, *args, **kwds): return rv_frozen(self, *args, **kwds) -# def link(self, x, logSF, theta, i): -# ''' -# Return theta[i] as function of quantile, survival probability and -# theta[j] for j!=i. -# -# Parameters -# ---------- -# x : quantile -# logSF : logarithm of the survival probability -# theta : list -# all distribution parameters including location and scale. -# -# Returns -# ------- -# theta[i] : real scalar -# fixed distribution parameter theta[i] as function of x, logSF and -# theta[j] where j != i. 
-# -# LINK is a function connecting the fixed distribution parameter theta[i] -# with the quantile (x) and the survival probability (SF) and the -# remaining free distribution parameters theta[j] for j!=i, i.e.: -# theta[i] = link(x, logSF, theta, i), -# where logSF = log(Prob(X>x; theta)). -# -# See also -# estimation.Profile -# ''' -# return self._link(x, logSF, theta, i) -# -# -# def _link(self, x, logSF, theta, i): -# msg = 'Link function not implemented for the {} distribution' -# raise NotImplementedError(msg.format(self.name)) - - def _resolve_ties(self, log_dprb, x, args, scale): dx = np.diff(x, axis=0) tie = dx == 0 @@ -293,7 +127,6 @@ def _nlogps_and_penalty(self, x, scale, args): if n_bad > 0: penalty = 100.0 * np.log(_XMAX) * n_bad return -np.sum(log_dprb[finite_log_dprb], axis=0) + penalty - return -np.sum(log_dprb, axis=0) @@ -353,7 +186,7 @@ def _nnlf_and_penalty(self, x, args): return -np.sum(logpdf, axis=0) -def _penalized_nnlf(self, theta, x, penalty=None): +def _penalized_nnlf(self, theta, x): ''' Return negative loglikelihood function, i.e., - sum (log pdf(x, theta), axis=0) where theta are the parameters (including loc and scale) @@ -617,8 +450,7 @@ def _open_support_mask(self, x): rv_generic.freeze = freeze rv_discrete.freeze = freeze rv_continuous.freeze = freeze -#rv_continuous.link = link -#rv_continuous._link = _link + rv_continuous._penalized_nlogps = _penalized_nlogps rv_continuous._penalized_nnlf = _penalized_nnlf rv_continuous._reduce_func = _reduce_func diff --git a/wafo/stats/estimation.py b/wafo/stats/estimation.py index 464cea7..9a4f5e9 100644 --- a/wafo/stats/estimation.py +++ b/wafo/stats/estimation.py @@ -9,11 +9,10 @@ Author: Per A. Brodtkorb 2008 from __future__ import division, absolute_import import warnings - +from scipy.stats import rv_continuous from wafo.plotbackend import plotbackend as plt -from wafo.misc import ecross, findcross, argsreduce -from wafo.stats._constants import _EPS, _XMAX, _XMIN -from wafo.stats._distn_infrastructure import rv_frozen, rv_continuous +from wafo.misc import ecross, findcross +from wafo.stats._constants import _EPS # , _XMAX, _XMIN from scipy._lib.six import string_types import numdifftools as nd # @UnresolvedImport from scipy import special @@ -36,6 +35,157 @@ arr = asarray # all = alltrue # @ReservedAssignment +def _assert_warn(cond, msg): + if not cond: + warnings.warn(msg) + + +def _assert(cond, msg): + if not cond: + raise ValueError(msg) + + +def _assert_index(cond, msg): + if not cond: + raise IndexError(msg) + + +def _assert_not_implemented(cond, msg): + if not cond: + raise NotImplementedError(msg) + + +# Frozen RV class +class rv_frozen(object): + ''' Frozen continous or discrete 1D Random Variable object (RV) + + Methods + ------- + rvs(size=1) + Random variates. + pdf(x) + Probability density function. + cdf(x) + Cumulative density function. + sf(x) + Survival function (1-cdf --- sometimes more accurate). + ppf(q) + Percent point function (inverse of cdf --- percentiles). + isf(q) + Inverse survival function (inverse of sf). + stats(moments='mv') + Mean('m'), variance('v'), skew('s'), and/or kurtosis('k'). + moment(n) + n-th order non-central moment of distribution. + entropy() + (Differential) entropy of the RV. + interval(alpha) + Confidence interval with equal areas around the median. + expect(func, lb, ub, conditional=False) + Calculate expected value of a function with respect to the + distribution. 
+    '''
+    def __init__(self, dist, *args, **kwds):
+        # create a new instance
+        self.dist = dist  # .__class__(**dist._ctor_param)
+        shapes, loc, scale = self.dist._parse_args(*args, **kwds)
+        if isinstance(dist, rv_continuous):
+            self.par = shapes + (loc, scale)
+        else:  # rv_discrete
+            self.par = shapes + (loc,)
+        self.a = self.dist.a
+        self.b = self.dist.b
+        self.shapes = self.dist.shapes
+    # @property
+    # def shapes(self):
+    #     return self.dist.shapes
+
+    @property
+    def random_state(self):
+        return self.dist._random_state
+
+    @random_state.setter
+    def random_state(self, seed):
+        self.dist._random_state = check_random_state(seed)
+
+    def pdf(self, x):
+        ''' Probability density function at x of the given RV.'''
+        return self.dist.pdf(x, *self.par)
+
+    def logpdf(self, x):
+        return self.dist.logpdf(x, *self.par)
+
+    def cdf(self, x):
+        '''Cumulative distribution function at x of the given RV.'''
+        return self.dist.cdf(x, *self.par)
+
+    def logcdf(self, x):
+        return self.dist.logcdf(x, *self.par)
+
+    def ppf(self, q):
+        '''Percent point function (inverse of cdf) at q of the given RV.'''
+        return self.dist.ppf(q, *self.par)
+
+    def isf(self, q):
+        '''Inverse survival function at q of the given RV.'''
+        return self.dist.isf(q, *self.par)
+
+    def rvs(self, size=None, random_state=None):
+        kwds = {'size': size, 'random_state': random_state}
+        return self.dist.rvs(*self.par, **kwds)
+
+    def sf(self, x):
+        '''Survival function (1-cdf) at x of the given RV.'''
+        return self.dist.sf(x, *self.par)
+
+    def logsf(self, x):
+        return self.dist.logsf(x, *self.par)
+
+    def stats(self, moments='mv'):
+        ''' Some statistics of the given RV'''
+        kwds = dict(moments=moments)
+        return self.dist.stats(*self.par, **kwds)
+
+    def median(self):
+        return self.dist.median(*self.par)
+
+    def mean(self):
+        return self.dist.mean(*self.par)
+
+    def var(self):
+        return self.dist.var(*self.par)
+
+    def std(self):
+        return self.dist.std(*self.par)
+
+    def moment(self, n):
+        return self.dist.moment(n, *self.par)
+
+    def entropy(self):
+        return self.dist.entropy(*self.par)
+
+    def pmf(self, k):
+        '''Probability mass function at k of the given RV'''
+        return self.dist.pmf(k, *self.par)
+
+    def logpmf(self, k):
+        return self.dist.logpmf(k, *self.par)
+
+    def interval(self, alpha):
+        return self.dist.interval(alpha, *self.par)
+
+    def expect(self, func=None, lb=None, ub=None, conditional=False, **kwds):
+        if isinstance(self.dist, rv_continuous):
+            a, loc, scale = self.par[:-2], self.par[-2], self.par[-1]
+            return self.dist.expect(func, a, loc, scale, lb, ub, conditional,
+                                    **kwds)
+
+        a, loc = self.par[:-1], self.par[-1]
+        if kwds:
+            raise ValueError("Discrete expect does not accept **kwds.")
+        return self.dist.expect(func, a, loc, lb, ub, conditional)
+
+
 def _burr_link(x, logsf, phat, ix):
     c, d, loc, scale = phat
     logp = log(-expm1(logsf))
@@ -90,43 +240,43 @@ def _genpareto_link(x, logsf, phat, ix):
     # Stuart Coles (2004)
     # "An introduction to statistical modelling of extreme values".
     # Springer series in statistics
+    _assert_not_implemented(ix != 0, 'link(x,logsf,phat,i) where i=0 is '
+                            'not implemented!')
     c, loc, scale = phat
+    if c == 0:
+        return _expon_link(x, logsf, phat[1:], ix-1)
     if ix == 2:
         # Reorganizing w.r.t. scale, Eq. 4.13 and 4.14, pp 81 in
         # Coles (2004) gives
         #   link = -(x-loc)*c/expm1(-c*logsf)
-        if c != 0.0:
-            phati = (x - loc) * c / expm1(-c * logsf)
-        else:
-            phati = -(x - loc) / logsf
-    elif ix == 1:
-        if c != 0:
-            phati = x + scale * expm1(c * logsf) / c
-        else:
-            phati = x + scale * logsf
-    elif ix == 0:
-        raise NotImplementedError(
-            'link(x,logsf,phat,i) where i=0 is not implemented!')
-    else:
-        raise IndexError('Index to the fixed parameter is out of bounds')
-    return phati
+        return (x - loc) * c / expm1(-c * logsf)
+    if ix == 1:
+        return x + scale * expm1(c * logsf) / c
+    raise IndexError('Index to the fixed parameter is out of bounds')
+
+
+def _gumbel_r_link(x, logsf, phat, ix):
+    loc, scale = phat
+    loglogP = log(-log(-expm1(logsf)))
+    if ix == 1:
+        return -(x - loc) / loglogP
+    if ix == 0:
+        return x + scale * loglogP
+    raise IndexError('Index to the fixed parameter is out of bounds')
 
 
 def _genextreme_link(x, logsf, phat, ix):
+    _assert_not_implemented(ix != 0, 'link(x,logsf,phat,i) where i=0 is not '
+                            'implemented!')
     c, loc, scale = phat
+    if c == 0:
+        return _gumbel_r_link(x, logsf, phat[1:], ix-1)
     loglogP = log(-log(-expm1(logsf)))
     if ix == 2:
         #   link = -(x-loc)*c/expm1(c*log(-logP))
-        if c != 0.0:
-            return -(x - loc) * c / expm1(c * loglogP)
-        return -(x - loc) / loglogP
+        return -(x - loc) * c / expm1(c * loglogP)
     if ix == 1:
-        if c != 0:
-            return x + scale * expm1(c * loglogP) / c
-        return x + scale * loglogP
-    if ix == 0:
-        raise NotImplementedError(
-            'link(x,logsf,phat,i) where i=0 is not implemented!')
+        return x + scale * expm1(c * loglogP) / c
     raise IndexError('Index to the fixed parameter is out of bounds')
@@ -168,6 +318,7 @@ LINKS = dict(expon=_expon_link,
              frechet_r=_weibull_min_link,
              genpareto=_genpareto_link,
              genexpon=_genexpon_link,
+             gumbel_r=_gumbel_r_link,
              rayleigh=_rayleigh_link,
              trunclayleigh=_trunclayleigh_link,
              genextreme=_genextreme_link,
@@ -414,9 +565,9 @@ class Profile(object):
         p_low, p_up = self._approx_p_min_max(p_opt)
         pmin, pmax = self.pmin, self.pmax
         if pmin is None:
-            pmin = self._search_pmin(phatfree0, p_low, p_opt)
+            pmin = self._search_pminmax(phatfree0, p_low, p_opt, 'min')
         if pmax is None:
-            pmax = self._search_pmax(phatfree0, p_up, p_opt)
+            pmax = self._search_pminmax(phatfree0, p_up, p_opt, 'max')
         return pmin, pmax
 
     def _adaptive_pvec(self, p_opt, pmin, pmax):
@@ -438,58 +589,32 @@ class Profile(object):
             return self._adaptive_pvec(p_opt, pmin, pmax)
         return np.linspace(self.pmin, self.pmax, self.n)
 
-    def _search_pmin(self, phatfree0, p_min0, p_opt):
+    def _search_pminmax(self, phatfree0, p_minmax0, p_opt, direction):
         phatfree = phatfree0.copy()
-
-        dp = np.maximum((p_opt - p_min0)/40, 0.01)*10
+        sign = -1 if direction == 'min' else 1
+        dp = np.maximum(sign*(p_minmax0 - p_opt) / 40, 0.01) * 10
         Lmax, phatfree = self._profile_optimum(phatfree, p_opt)
-        p_min_opt = p_min0
+        p_minmax_opt = p_minmax0
        for j in range(51):
-            p_min = p_opt - dp
-            Lmax, phatfree = self._profile_optimum(phatfree, p_min)
-            # print((dp, p_min, p_min_opt, Lmax))
+            p_minmax = p_opt + sign * dp
+            Lmax, phatfree = self._profile_optimum(phatfree, p_minmax)
+            # print((dp, p_minmax, p_minmax_opt, Lmax))
             if np.isnan(Lmax):
                 dp *= 0.33
             elif Lmax < self.alpha_cross_level - self.alpha_Lrange*5*(j+1):
-                p_min_opt = p_min
+                p_minmax_opt = p_minmax
                 dp *= 0.33
             elif Lmax < self.alpha_cross_level:
-                p_min_opt = p_min
+                p_minmax_opt = p_minmax
                 break
             else:
                 dp *= 1.67
         else:
-            msg = 'Exceeded max iterations. (p_min0={}, p_min={}, p={})'
-            warnings.warn(msg.format(p_min0, p_min_opt, p_opt))
+            msg = 'Exceeded max iterations. (p_{0}0={1}, p_{0}={2}, p={3})'
+            warnings.warn(msg.format(direction, p_minmax0, p_minmax_opt,
+                                     p_opt))
         # print('search_pmin iterations={}'.format(j))
-        return p_min_opt
-
-    def _search_pmax(self, phatfree0, p_max0, p_opt):
-        phatfree = phatfree0.copy()
-
-        dp = (p_max0 - p_opt)/40
-        if dp < 1e-2:
-            dp = 0.1
-        Lmax, phatfree = self._profile_optimum(phatfree, p_opt)
-        p_max_opt = p_opt
-        for j in range(51):
-            p_max = p_opt + dp
-            Lmax, phatfree = self._profile_optimum(phatfree, p_max)
-            if np.isnan(Lmax):
-                dp *= 0.33
-            elif Lmax < self.alpha_cross_level - self.alpha_Lrange*5*(j+1):
-                p_max_opt = p_max
-                dp *= 0.33
-            elif Lmax < self.alpha_cross_level:
-                p_max_opt = p_max
-                break
-            else:
-                dp *= 1.67
-        else:
-            msg = 'Exceeded max iterations. (p={}, p_max={}, p_max0 = {})'
-            warnings.warn(msg.format(p_opt, p_max_opt, p_max0))
-            # print('search_pmax iterations={}'.format(j))
-        return p_max_opt
+        return p_minmax_opt
 
     def _profile_fun(self, free_par, fix_par):
         ''' Return negative of loglike or logps function
@@ -504,37 +629,40 @@ class Profile(object):
         par[self.i_fixed] = self._local_link(fix_par, par)
         return self.fit_dist.fitfun(par)
 
-    def get_bounds(self, alpha=0.05):
-        '''Return confidence interval for profiled parameter
-        '''
-        if alpha < self.alpha:
-            msg = 'Might not be able to return bounds with alpha less than {}'
-            warnings.warn(msg.format(self.alpha))
-        cross_level = self.Lmax - 0.5 * chi2isf(alpha, 1)
-        ind = findcross(self.data, cross_level)
-        n = len(ind)
+    def _check_bounds(self, cross_level, ind, n):
         if n == 0:
-            warnings.warn('''Number of crossings is zero, i.e.,
-            upper and lower bound is not found!''')
-            bounds = (self.pmin, self.pmax)
-
+            warnings.warn('Number of crossings is zero, i.e., upper and lower '
+                          'bound is not found!')
+            bounds = self.pmin, self.pmax
         elif n == 1:
             x0 = ecross(self.args, self.data, ind, cross_level)
             is_upcrossing = self.data[ind] < self.data[ind + 1]
             if is_upcrossing:
-                bounds = (x0, self.pmax)
+                bounds = x0, self.pmax
                 warnings.warn('Upper bound is larger')
             else:
-                bounds = (self.pmin, x0)
+                bounds = self.pmin, x0
                 warnings.warn('Lower bound is smaller')
-
-        elif n == 2:
-            bounds = ecross(self.args, self.data, ind, cross_level)
         else:
             warnings.warn('Number of crossings too large! Something is wrong!')
             bounds = ecross(self.args, self.data, ind[[0, -1]], cross_level)
         return bounds
 
+    def get_bounds(self, alpha=0.05):
+        '''Return confidence interval for profiled parameter
+        '''
+        _assert_warn(self.alpha <= alpha, 'Might not be able to return bounds '
+                     'with alpha less than {}'.format(self.alpha))
+
+        cross_level = self.Lmax - 0.5 * chi2isf(alpha, 1)
+        ind = findcross(self.data, cross_level)
+        n = len(ind)
+        if n == 2:
+            bounds = ecross(self.args, self.data, ind, cross_level)
+        else:
+            bounds = self._check_bounds(cross_level, ind, n)
+        return bounds
+
     def plot(self, axis=None):
         ''' Plot profile function for p_opt with 100(1-alpha)% confidence
         interval.
@@ -1121,7 +1249,6 @@ class FitDistribution(rv_frozen):
     def _nnlf(self, theta, x):
         return self.dist._penalized_nnlf(theta, x)
 
-
     def _nlogps(self, theta, x):
         """ Moran's negative log Product Spacings statistic
@@ -1323,6 +1450,29 @@ class FitDistribution(rv_frozen):
                 ci.append((np.nan, np.nan))
         return np.array(ci)
 
+    def _fit_summary_text(self):
+        fixstr = ''
+        if self.par_fix is not None:
+            numfix = len(self.i_fixed)
+            if numfix > 0:
+                format0 = ', '.join(['%d'] * numfix)
+                format1 = ', '.join(['%g'] * numfix)
+                phatistr = format0 % tuple(self.i_fixed)
+                phatvstr = format1 % tuple(self.par[self.i_fixed])
+                fixstr = 'Fixed: phat[{0:s}] = {1:s} '.format(phatistr,
+                                                              phatvstr)
+        subtxt = ('Fit method: {0:s}, Fit p-value: {1:2.2f} {2:s}, ' +
+                  'phat=[{3:s}], {4:s}')
+        par_txt = ('{:1.2g}, ' * len(self.par))[:-2].format(*self.par)
+        try:
+            LL_txt = 'Lps_max={:2.2g}, Ll_max={:2.2g}'.format(self.LPSmax,
+                                                              self.LLmax)
+        except Exception:
+            LL_txt = 'Lps_max={}, Ll_max={}'.format(self.LPSmax, self.LLmax)
+        txt = subtxt.format(self.method.upper(), self.pvalue, fixstr, par_txt,
+                            LL_txt)
+        return txt
+
     def plotfitsummary(self, axes=None, fig=None):
         ''' Plot various diagnostic plots to asses the quality of the fit.
@@ -1346,29 +1496,9 @@ class FitDistribution(rv_frozen):
         if fig is None:
             fig = plt.gcf()
 
-        fixstr = ''
-        if self.par_fix is not None:
-            numfix = len(self.i_fixed)
-            if numfix > 0:
-                format0 = ', '.join(['%d'] * numfix)
-                format1 = ', '.join(['%g'] * numfix)
-                phatistr = format0 % tuple(self.i_fixed)
-                phatvstr = format1 % tuple(self.par[self.i_fixed])
-                fixstr = 'Fixed: phat[{0:s}] = {1:s} '.format(phatistr,
-                                                              phatvstr)
-
-        subtxt = ('Fit method: {0:s}, Fit p-value: {1:2.2f} {2:s}, ' +
-                  'phat=[{3:s}], {4:s}')
-        par_txt = ('{:1.2g}, '*len(self.par))[:-2].format(*self.par)
-        try:
-            LL_txt = 'Lps_max={:2.2g}, Ll_max={:2.2g}'.format(self.LPSmax,
-                                                              self.LLmax)
-        except Exception:
-            LL_txt = 'Lps_max={}, Ll_max={}'.format(self.LPSmax, self.LLmax)
         try:
-            fig.text(0.05, 0.01, subtxt.format(self.method.upper(),
-                                               self.pvalue, fixstr, par_txt,
-                                               LL_txt))
+            txt = self._fit_summary_text()
+            fig.text(0.05, 0.01, txt)
         except AttributeError:
             pass
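
A minimal usage sketch of the code paths this patch touches, for reviewers. It assumes the
wafo.stats API mirrors scipy.stats and that FitDistribution and Profile accept the keyword
arguments shown (method, i, alpha); those argument names are not confirmed by the diff itself.

    from wafo import stats as ws
    from wafo.stats.estimation import FitDistribution, Profile

    # Simulated GEV sample; genextreme.rvs follows the scipy.stats signature.
    data = ws.genextreme.rvs(0.2, loc=1.0, scale=0.5, size=200)

    # FitDistribution subclasses the relocated rv_frozen, so the frozen-RV
    # methods (pdf, cdf, sf, ...) are available directly on the fitted object.
    phat = FitDistribution(ws.genextreme, data, method='ml')
    print(phat.par)
    print(phat.sf(2.0))

    # Profile the shape parameter (assumed index i=0); get_bounds() walks the
    # profile log-likelihood via the unified _search_pminmax helper and falls
    # back to _check_bounds when the crossing count is not exactly two.
    profile = Profile(phat, i=0)
    ci_low, ci_up = profile.get_bounds(alpha=0.05)
    print(ci_low, ci_up)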