''' Contains FitDistribution and Profile class, which are important classes for fitting to various Continous and Discrete Probability Distributions Author: Per A. Brodtkorb 2008 ''' from __future__ import division, absolute_import import warnings from wafo.plotbackend import plotbackend from wafo.misc import ecross, findcross, argsreduce from wafo.stats._constants import _EPS, _XMAX from wafo.stats._distn_infrastructure import rv_frozen, rv_continuous from scipy._lib.six import string_types import numdifftools as nd # @UnresolvedImport from scipy import special from scipy.linalg import pinv2 from scipy import optimize import numpy as np from numpy import (alltrue, arange, zeros, log, sqrt, exp, expm1, atleast_1d, any, asarray, nan, pi, isfinite) from numpy import flatnonzero as nonzero __all__ = ['Profile', 'FitDistribution'] floatinfo = np.finfo(float) arr = asarray all = alltrue # @ReservedAssignment def _expon_link(x, logSF, phat, ix): if ix == 1: return - (x - phat[0]) / logSF elif ix == 0: return x + phat[1] * logSF else: raise IndexError('Index to the fixed parameter is out of bounds') def _weibull_min_link(x, logSF, phat, ix): if ix == 0: phati = log(-logSF) / log((x - phat[1]) / phat[2]) elif ix == 1: phati = x - phat[2] * (-logSF) ** (1. / phat[0]) elif ix == 2: phati = (x - phat[1]) / (-logSF) ** (1. / phat[0]) else: raise IndexError('Index to the fixed parameter is out of bounds') return phati def _genpareto_link(x, logSF, phat, ix): # Reference # Stuart Coles (2004) # "An introduction to statistical modelling of extreme values". # Springer series in statistics c, loc, scale = phat if ix == 2: # Reorganizing w.r.t.scale, Eq. 4.13 and 4.14, pp 81 in # Coles (2004) gives # link = -(x-loc)*c/expm1(-c*logSF) if c != 0.0: phati = (x - loc) * c / expm1(-c * logSF) else: phati = -(x - loc) / logSF elif ix == 1: if c != 0: phati = x + scale * expm1(c * logSF) / c else: phati = x + scale * logSF elif ix == 0: raise NotImplementedError( 'link(x,logSF,phat,i) where i=0 is not implemented!') else: raise IndexError('Index to the fixed parameter is out of bounds') return phati def _genexpon_link(x, logSF, phat, ix): _a, b, c, loc, scale = phat xn = (x - loc) / scale fact1 = (xn + expm1(-c * xn) / c) if ix == 0: phati = b * fact1 + logSF elif ix == 1: phati = (phat[0] - logSF) / fact1 elif ix in [2, 3, 4]: raise NotImplementedError('Only implemented for index in [0,1]!') else: raise IndexError('Index to the fixed parameter is out of bounds') return phati def _rayleigh_link(x, logSF, phat, ix): if ix == 1: return x - phat[0] / sqrt(-2.0 * logSF) elif ix == 0: return x - phat[1] * sqrt(-2.0 * logSF) else: raise IndexError('Index to the fixed parameter is out of bounds') def _trunclayleigh_link(x, logSF, phat, ix): c, loc, scale = phat if ix == 2: return x - loc / (sqrt(c * c - 2 * logSF) - c) elif ix == 1: return x - scale * (sqrt(c * c - 2 * logSF) - c) elif ix == 0: xn = (x - loc) / scale return - 2 * logSF / xn - xn / 2.0 else: raise IndexError('Index to the fixed parameter is out of bounds') LINKS = dict(expon=_expon_link, weibull_min=_weibull_min_link, frechet_r=_weibull_min_link, genpareto=_genpareto_link, genexpon=_genexpon_link, rayleigh=_rayleigh_link, trunclayleigh=_trunclayleigh_link) def chi2isf(p, df): return special.chdtri(df, p) def chi2sf(x, df): return special.chdtrc(df, x) def norm_ppf(q): return special.ndtri(q) # internal class to profile parameters of a given distribution class Profile(object): ''' Profile Log- likelihood or Product Spacing-function. which can be used for constructing confidence interval for either phat[i], probability or quantile. Parameters ---------- fit_dist : FitDistribution object with ML or MPS estimated distribution parameters. **kwds : named arguments with keys i : scalar integer defining which distribution parameter to keep fixed in the profiling process (default first non-fixed parameter) pmin, pmax : real scalars Interval for either the parameter, phat(i), prb, or x, used in the optimization of the profile function (default is based on the 100*(1-alpha)% confidence interval computed with the delta method.) N : scalar integer Max number of points used in Lp (default 100) x : real scalar Quantile (return value) (default None) logSF : real scalar log survival probability,i.e., SF = Prob(X>x;phat) (default None) link : function connecting the x-quantile and the survival probability (SF) with the fixed distribution parameter, i.e.: self.par[i] = link(x, logSF, self.par, i), where logSF = log(Prob(X>x;phat)). This means that if: 1) x is not None then x is profiled 2) logSF is not None then logSF is profiled 3) x and logSF are None then self.par[i] is profiled (default) alpha : real scalar confidence coefficent (default 0.05) Returns ------- Lp : Profile log-likelihood function with parameters phat given the data, phat(i), probability (prb) and quantile (x) (if given), i.e., Lp = max(log(f(phat|data,phat(i)))), or Lp = max(log(f(phat|data,phat(i),x,prb))) Member methods ------------- plot() : Plot profile function with 100(1-alpha)% confidence interval get_bounds() : Return 100(1-alpha)% confidence interval Member variables ---------------- fit_dist : FitDistribution data object. data : profile function values args : profile function arguments alpha : confidence coefficient Lmax : Maximum value of profile function alpha_cross_level : PROFILE is a utility function for making inferences either on a particular component of the vector phat or the quantile, x, or the probability, SF. This is usually more accurate than using the delta method assuming asymptotic normality of the ML estimator or the MPS estimator. Examples -------- # MLE >>> import wafo.stats as ws >>> R = ws.weibull_min.rvs(1,size=100); >>> phat = FitDistribution(ws.weibull_min, R, 1, scale=1, floc=0.0) # Better CI for phat.par[i=0] >>> Lp = Profile(phat, i=0) >>> Lp.plot() >>> phat_ci = Lp.get_bounds(alpha=0.1) >>> SF = 1./990 >>> x = phat.isf(SF) # CI for x >>> Lx = phat.profile(i=0, x=x, link=phat.dist.link) >>> Lx.plot() >>> x_ci = Lx.get_bounds(alpha=0.2) # CI for logSF=log(SF) >>> Lsf = phat.profile(i=0, logSF=log(SF), link=phat.dist.link) >>> Lsf.plot() >>> sf_ci = Lsf.get_bounds(alpha=0.2) ''' def __init__(self, fit_dist, method=None, **kwds): if method is None: method = fit_dist.method.lower() try: i0 = (1 - np.isfinite(fit_dist.par_fix)).argmax() except: i0 = 0 self.fit_dist = fit_dist self.data = None self.args = None self.title = '' self.xlabel = '' self.ylabel = 'Profile log' (self.i_fixed, self.N, self.alpha, self.pmin, self.pmax, self.x, self.logSF, self.link) = [kwds.get(name, val) for name, val in zip( ['i', 'N', 'alpha', 'pmin', 'pmax', 'x', 'logSF', 'link'], [i0, 100, 0.05, None, None, None, None, None])] self.title = '%g%s CI' % (100 * (1.0 - self.alpha), '%') if method.startswith('ml'): self.ylabel = self.ylabel + 'likelihood' Lmax = fit_dist.LLmax elif method.startswith('mps'): self.ylabel = self.ylabel + ' product spacing' Lmax = fit_dist.LPSmax else: raise ValueError( "PROFILE is only valid for ML- or MPS- estimators") if fit_dist.par_fix is None: isnotfixed = np.ones(fit_dist.par.shape, dtype=bool) else: isnotfixed = 1 - np.isfinite(fit_dist.par_fix) self.i_notfixed = nonzero(isnotfixed) self.i_fixed = atleast_1d(self.i_fixed) if 1 - isnotfixed[self.i_fixed]: raise IndexError( "Index i must be equal to an index to one of the free " + "parameters.") isfree = isnotfixed isfree[self.i_fixed] = False self.i_free = nonzero(isfree) self.Lmax = Lmax self.alpha_Lrange = 0.5 * chi2isf(self.alpha, 1) self.alpha_cross_level = Lmax - self.alpha_Lrange # lowLevel = self.alpha_cross_level - self.alpha_Lrange / 7.0 phatv = fit_dist.par.copy() self._par = phatv.copy() # Set up variable to profile and _local_link function self.profile_x = self.x is not None self.profile_logSF = not (self.logSF is None or self.profile_x) self.profile_par = not (self.profile_x or self.profile_logSF) if self.link is None: self.link = LINKS[self.fit_dist.dist.name] if self.profile_par: self._local_link = self._par_link self.xlabel = 'phat(%d)' % self.i_fixed p_opt = self._par[self.i_fixed] elif self.profile_x: self.logSF = fit_dist.logsf(self.x) self._local_link = self._x_link self.xlabel = 'x' p_opt = self.x elif self.profile_logSF: p_opt = self.logSF self.x = fit_dist.isf(exp(p_opt)) self._local_link = self._logSF_link self.xlabel = 'log(SF)' else: raise ValueError( "You must supply a non-empty quantile (x) or probability " + "(logSF) in order to profile it!") self.xlabel = self.xlabel + ' (' + fit_dist.dist.name + ')' phatfree = phatv[self.i_free].copy() self._set_profile(phatfree, p_opt) def _par_link(self, fix_par, par): return fix_par def _x_link(self, fix_par, par): return self.link(fix_par, self.logSF, par, self.i_fixed) def _logSF_link(self, fix_par, par): return self.link(self.x, fix_par, par, self.i_fixed) def _correct_Lmax(self, Lmax): if Lmax > self.Lmax: # foundNewphat = True warnings.warn( 'The fitted parameters does not provide the optimum fit. ' + 'Something wrong with fit') dL = self.Lmax - Lmax self.alpha_cross_level -= dL self.Lmax = Lmax def _profile_optimum(self, phatfree0, p_opt): phatfree = optimize.fmin( self._profile_fun, phatfree0, args=(p_opt,), disp=0) Lmax = -self._profile_fun(phatfree, p_opt) self._correct_Lmax(Lmax) return Lmax, phatfree def _set_profile(self, phatfree0, p_opt): pvec = self._get_pvec(phatfree0, p_opt) self.data = np.ones_like(pvec) * nan k1 = (pvec >= p_opt).argmax() for size, step in ((-1, -1), (pvec.size, 1)): phatfree = phatfree0.copy() for ix in range(k1, size, step): Lmax, phatfree = self._profile_optimum(phatfree, pvec[ix]) self.data[ix] = Lmax if self.data[ix] < self.alpha_cross_level: break np.putmask(pvec, np.isnan(self.data), nan) self.args = pvec self._prettify_profile() def _prettify_profile(self): pvec = self.args ix = nonzero(np.isfinite(pvec)) self.data = self.data[ix] self.args = pvec[ix] cond = self.data == -np.inf if any(cond): ind, = cond.nonzero() self.data.put(ind, floatinfo.min / 2.0) ind1 = np.where(ind == 0, ind, ind - 1) cl = self.alpha_cross_level - self.alpha_Lrange / 2.0 t0 = ecross(self.args, self.data, ind1, cl) self.data.put(ind, cl) self.args.put(ind, t0) def _get_variance(self): if self.profile_par: pvar = self.fit_dist.par_cov[self.i_fixed, :][:, self.i_fixed] else: i_notfixed = self.i_notfixed phatv = self._par if self.profile_x: gradfun = nd.Gradient(self._myinvfun) else: gradfun = nd.Gradient(self._myprbfun) drl = gradfun(phatv[self.i_notfixed]) pcov = self.fit_dist.par_cov[i_notfixed, :][:, i_notfixed] pvar = np.sum(np.dot(drl, pcov) * drl) return pvar def _get_pvec(self, phatfree0, p_opt): ''' return proper interval for the variable to profile ''' linspace = np.linspace if self.pmin is None or self.pmax is None: pvar = self._get_variance() if pvar <= 1e-5 or np.isnan(pvar): pvar = max(abs(p_opt) * 0.5, 0.5) p_crit = (-norm_ppf(self.alpha / 2.0) * sqrt(np.ravel(pvar)) * 1.5) if self.pmin is None: self.pmin = self._search_pmin(phatfree0, p_opt - 5.0 * p_crit, p_opt) p_crit_low = (p_opt - self.pmin) / 5 if self.pmax is None: self.pmax = self._search_pmax(phatfree0, p_opt + 5.0 * p_crit, p_opt) p_crit_up = (self.pmax - p_opt) / 5 N4 = np.floor(self.N / 4.0) pvec1 = linspace(self.pmin, p_opt - p_crit_low, N4 + 1) pvec2 = linspace( p_opt - p_crit_low, p_opt + p_crit_up, self.N - 2 * N4) pvec3 = linspace(p_opt + p_crit_up, self.pmax, N4 + 1) pvec = np.unique(np.hstack((pvec1, p_opt, pvec2, pvec3))) else: pvec = linspace(self.pmin, self.pmax, self.N) return pvec def _search_pmin(self, phatfree0, p_min0, p_opt): phatfree = phatfree0.copy() dp = p_opt - p_min0 if dp < 1e-2: dp = 0.1 p_min_opt = p_min0 Lmax, phatfree = self._profile_optimum(phatfree, p_opt) for _j in range(50): p_min = p_opt - dp Lmax, phatfree = self._profile_optimum(phatfree, p_min) if np.isnan(Lmax): dp *= 0.33 elif Lmax < self.alpha_cross_level - self.alpha_Lrange * 2: p_min_opt = p_min dp *= 0.33 elif Lmax < self.alpha_cross_level: p_min_opt = p_min break else: dp *= 1.67 return p_min_opt def _search_pmax(self, phatfree0, p_max0, p_opt): phatfree = phatfree0.copy() dp = p_max0 - p_opt if dp < 1e-2: dp = 0.1 p_max_opt = p_max0 Lmax, phatfree = self._profile_optimum(phatfree, p_opt) for _j in range(50): p_max = p_opt + dp Lmax, phatfree = self._profile_optimum(phatfree, p_max) if np.isnan(Lmax): dp *= 0.33 elif Lmax < self.alpha_cross_level - self.alpha_Lrange * 2: p_max_opt = p_max dp *= 0.33 elif Lmax < self.alpha_cross_level: p_max_opt = p_max break else: dp *= 1.67 return p_max_opt def _myinvfun(self, phatnotfixed): mphat = self._par.copy() mphat[self.i_notfixed] = phatnotfixed prb = exp(self.logSF) return self.fit_dist.dist.isf(prb, *mphat) def _myprbfun(self, phatnotfixed): mphat = self._par.copy() mphat[self.i_notfixed] = phatnotfixed logSF = self.fit_dist.dist.logsf(self.x, *mphat) return np.where(np.isfinite(logSF), logSF, np.nan) def _profile_fun(self, free_par, fix_par): ''' Return negative of loglike or logps function free_par - vector of free parameters fix_par - fixed parameter, i.e., either quantile (return level), probability (return period) or distribution parameter ''' par = self._par.copy() par[self.i_free] = free_par # _local_link: connects fixed quantile or probability with fixed # distribution parameter par[self.i_fixed] = self._local_link(fix_par, par) return self.fit_dist.fitfun(par) def get_bounds(self, alpha=0.05): '''Return confidence interval for profiled parameter ''' if alpha < self.alpha: warnings.warn( 'Might not be able to return CI with alpha less than %g' % self.alpha) cross_level = self.Lmax - 0.5 * chi2isf(alpha, 1) ind = findcross(self.data, cross_level) N = len(ind) if N == 0: warnings.warn('''Number of crossings is zero, i.e., upper and lower bound is not found!''') CI = (self.pmin, self.pmax) elif N == 1: x0 = ecross(self.args, self.data, ind, cross_level) isUpcrossing = self.data[ind] > self.data[ind + 1] if isUpcrossing: CI = (x0, self.pmax) warnings.warn('Upper bound is larger') else: CI = (self.pmin, x0) warnings.warn('Lower bound is smaller') elif N == 2: CI = ecross(self.args, self.data, ind, cross_level) else: warnings.warn('Number of crossings too large! Something is wrong!') CI = ecross(self.args, self.data, ind[[0, -1]], cross_level) return CI def plot(self, axis=None): ''' Plot profile function with 100(1-alpha)% CI ''' if axis is None: axis = plotbackend.gca() p_ci = self.get_bounds(self.alpha) axis.plot( self.args, self.data, self.args[[0, -1]], [self.Lmax, ] * 2, 'r--', self.args[[0, -1]], [self.alpha_cross_level, ] * 2, 'r--') axis.vlines(p_ci, ymin=axis.get_ylim()[0], ymax=self.Lmax, # self.alpha_cross_level, color='r', linestyles='--') axis.set_title(self.title) axis.set_ylabel(self.ylabel) axis.set_xlabel(self.xlabel) class FitDistribution(rv_frozen): ''' Return estimators to shape, location, and scale from data Starting points for the fit are given by input arguments. For any arguments not given starting points, dist._fitstart(data) is called to get the starting estimates. You can hold some parameters fixed to specific values by passing in keyword arguments f0..fn for shape paramters and floc, fscale for location and scale parameters. Parameters ---------- dist : scipy distribution object distribution to fit to data data : array-like Data to use in calculating the ML or MPS estimators args : optional Starting values for any shape arguments (those not specified will be determined by dist._fitstart(data)) kwds : loc, scale Starting values for the location and scale parameters Special keyword arguments are recognized as holding certain parameters fixed: f0..fn : hold respective shape paramters fixed floc : hold location parameter fixed to specified value fscale : hold scale parameter fixed to specified value method : of estimation. Options are 'ml' : Maximum Likelihood method (default) 'mps': Maximum Product Spacing method alpha : scalar, optional Confidence coefficent (default=0.05) search : bool If true search for best estimator (default), otherwise return object with initial distribution parameters copydata : bool If true copydata (default) optimizer : The optimizer to use. The optimizer must take func, and starting position as the first two arguments, plus args (for extra arguments to pass to the function to be optimized) and disp=0 to suppress output as keyword arguments. Return ------ phat : FitDistribution object Fitted distribution object with following member variables: LLmax : loglikelihood function evaluated using par LPSmax : log product spacing function evaluated using par pvalue : p-value for the fit par : distribution parameters (fixed and fitted) par_cov : covariance of distribution parameters par_fix : fixed distribution parameters par_lower : lower (1-alpha)% confidence bound for the parameters par_upper : upper (1-alpha)% confidence bound for the parameters Note ---- `data` is sorted using this function, so if `copydata`==False the data in your namespace will be sorted as well. Examples -------- Estimate distribution parameters for weibull_min distribution. >>> import wafo.stats as ws >>> R = ws.weibull_min.rvs(1,size=100); >>> phat = FitDistribution(ws.weibull_min, R, 1, scale=1, floc=0.0) #Plot various diagnostic plots to asses quality of fit. >>> phat.plotfitsummary() #phat.par holds the estimated parameters #phat.par_upper upper CI for parameters #phat.par_lower lower CI for parameters #Better CI for phat.par[0] >>> Lp = phat.profile(i=0) >>> Lp.plot() >>> p_ci = Lp.get_bounds(alpha=0.1) >>> SF = 1./990 >>> x = phat.isf(SF) # CI for x >>> Lx = phat.profile(i=0,x=x,link=phat.dist.link) >>> Lx.plot() >>> x_ci = Lx.get_bounds(alpha=0.2) # CI for logSF=log(SF) >>> Lsf = phat.profile(i=0, logSF=log(SF), link=phat.dist.link) >>> Lsf.plot() >>> sf_ci = Lsf.get_bounds(alpha=0.2) ''' def __init__(self, dist, data, args=(), method='ML', alpha=0.05, par_fix=None, search=True, copydata=True, **kwds): extradoc = ''' plotfitsummary() Plot various diagnostic plots to asses quality of fit. plotecdf() Plot Empirical and fitted Cumulative Distribution Function plotesf() Plot Empirical and fitted Survival Function plotepdf() Plot Empirical and fitted Probability Distribution Function plotresq() Displays a residual quantile plot. plotresprb() Displays a residual probability plot. profile() Return Profile Log- likelihood or Product Spacing-function. Parameters ---------- x : array-like quantiles q : array-like lower or upper tail probability size : int or tuple of ints, optional shape of random variates (default computed from input arguments ) moments : str, optional composed of letters ['mvsk'] specifying which moments to compute where 'm' = mean, 'v' = variance, 's' = (Fisher's) skew and 'k' = (Fisher's) kurtosis. (default='mv') ''' # Member variables # ---------------- # data - data used in fitting # alpha - confidence coefficient # method - method used # LLmax - loglikelihood function evaluated using par # LPSmax - log product spacing function evaluated using par # pvalue - p-value for the fit # search - True if search for distribution parameters (default) # copydata - True if copy input data (default) # # par - parameters (fixed and fitted) # par_cov - covariance of parameters # par_fix - fixed parameters # par_lower - lower (1-alpha)% confidence bound for the parameters # par_upper - upper (1-alpha)% confidence bound for the parameters # # ''' self.__doc__ = str(rv_frozen.__doc__) + extradoc self.dist = dist numargs = dist.numargs self.method = method.lower() self.alpha = alpha self.par_fix = par_fix self.search = search self.copydata = copydata if self.method.startswith('mps'): self._fitfun = self._nlogps else: self._fitfun = self._nnlf self.data = np.ravel(data) if self.copydata: self.data = self.data.copy() self.data.sort() par, fixedn = self._fit(*args, **kwds.copy()) super(FitDistribution, self).__init__(dist, *par) self.par = arr(par) somefixed = len(fixedn) > 0 if somefixed: self.par_fix = [nan, ] * len(self.par) for i in fixedn: self.par_fix[i] = self.par[i] self.i_notfixed = nonzero(1 - isfinite(self.par_fix)) self.i_fixed = nonzero(isfinite(self.par_fix)) numpar = numargs + 2 self.par_cov = zeros((numpar, numpar)) self._compute_cov() # Set confidence interval for parameters pvar = np.diag(self.par_cov) zcrit = -norm_ppf(self.alpha / 2.0) self.par_lower = self.par - zcrit * sqrt(pvar) self.par_upper = self.par + zcrit * sqrt(pvar) self.LLmax = -self._nnlf(self.par, self.data) self.LPSmax = -self._nlogps(self.par, self.data) self.pvalue = self._pvalue(self.par, self.data, unknown_numpar=numpar) def __repr__(self): params = ['alpha', 'method', 'LLmax', 'LPSmax', 'pvalue', 'par', 'par_lower', 'par_upper', 'par_fix', 'par_cov'] t = ['%s:\n' % self.__class__.__name__] for par in params: t.append('%s = %s\n' % (par, str(getattr(self, par)))) return ''.join(t) def _reduce_func(self, args, kwds): # First of all, convert fshapes params to fnum: eg for stats.beta, # shapes='a, b'. To fix `a`, can specify either `f1` or `fa`. # Convert the latter into the former. shapes = self.dist.shapes if shapes: shapes = shapes.replace(',', ' ').split() for j, s in enumerate(shapes): val = kwds.pop('f' + s, None) or kwds.pop('fix_' + s, None) if val is not None: key = 'f%d' % j if key in kwds: raise ValueError("Duplicate entry for %s." % key) else: kwds[key] = val args = list(args) Nargs = len(args) fixedn = [] names = ['f%d' % n for n in range(Nargs - 2)] + ['floc', 'fscale'] x0 = [] for n, key in enumerate(names): if key in kwds: fixedn.append(n) args[n] = kwds.pop(key) else: x0.append(args[n]) fitfun = self._fitfun if len(fixedn) == 0: func = fitfun restore = None else: if len(fixedn) == Nargs: raise ValueError("All parameters fixed. " + "There is nothing to optimize.") def restore(args, theta): # Replace with theta for all numbers not in fixedn # This allows the non-fixed values to vary, but # we still call self.nnlf with all parameters. i = 0 for n in range(Nargs): if n not in fixedn: args[n] = theta[i] i += 1 return args def func(theta, x): newtheta = restore(args[:], theta) return fitfun(newtheta, x) return x0, func, restore, args, fixedn @staticmethod def _hessian(nnlf, theta, data, eps=None): ''' approximate hessian of nnlf where theta are the parameters (including loc and scale) ''' if eps is None: eps = (_EPS) ** 0.4 num_par = len(theta) # pab 07.01.2001: Always choose the stepsize h so that # it is an exactly representable number. # This is important when calculating numerical derivatives and is # accomplished by the following. delta = (eps + 2.0) - 2.0 delta2 = delta ** 2.0 # Approximate 1/(nE( (d L(x|theta)/dtheta)^2)) with # 1/(d^2 L(theta|x)/dtheta^2) # using central differences LL = nnlf(theta, data) H = zeros((num_par, num_par)) # Hessian matrix theta = tuple(theta) for ix in range(num_par): sparam = list(theta) sparam[ix] = theta[ix] + delta fp = nnlf(sparam, data) sparam[ix] = theta[ix] - delta fm = nnlf(sparam, data) H[ix, ix] = (fp - 2 * LL + fm) / delta2 for iy in range(ix + 1, num_par): sparam[ix] = theta[ix] + delta sparam[iy] = theta[iy] + delta fpp = nnlf(sparam, data) sparam[iy] = theta[iy] - delta fpm = nnlf(sparam, data) sparam[ix] = theta[ix] - delta fmm = nnlf(sparam, data) sparam[iy] = theta[iy] + delta fmp = nnlf(sparam, data) H[ix, iy] = ((fpp + fmm) - (fmp + fpm)) / (4. * delta2) H[iy, ix] = H[ix, iy] sparam[iy] = theta[iy] return -H def _nnlf(self, theta, x): return self.dist._penalized_nnlf(theta, x) def _nlogps(self, theta, x): """ Moran's negative log Product Spacings statistic where theta are the parameters (including loc and scale) Note the data in x must be sorted References ----------- R. C. H. Cheng; N. A. K. Amin (1983) "Estimating Parameters in Continuous Univariate Distributions with a Shifted Origin.", Journal of the Royal Statistical Society. Series B (Methodological), Vol. 45, No. 3. (1983), pp. 394-403. R. C. H. Cheng; M. A. Stephens (1989) "A Goodness-Of-Fit Test Using Moran's Statistic with Estimated Parameters", Biometrika, 76, 2, pp 385-392 Wong, T.S.T. and Li, W.K. (2006) "A note on the estimation of extreme value distributions using maximum product of spacings.", IMS Lecture Notes Monograph Series 2006, Vol. 52, pp. 272-283 """ n = 2 if isinstance(self.dist, rv_continuous) else 1 try: loc = theta[-n] scale = theta[-1] args = tuple(theta[:-n]) except IndexError: raise ValueError("Not enough input arguments.") if not isinstance(self.dist, rv_continuous): scale = 1 if not self.dist._argcheck(*args) or scale <= 0: return np.inf dist = self.dist x = asarray((x - loc) / scale) cond0 = (x < dist.a) | (dist.b < x) Nbad = np.sum(cond0) if Nbad > 0: x = argsreduce(~cond0, x)[0] lowertail = True if lowertail: prb = np.hstack((0.0, dist.cdf(x, *args), 1.0)) dprb = np.diff(prb) else: prb = np.hstack((1.0, dist.sf(x, *args), 0.0)) dprb = -np.diff(prb) logD = log(dprb) dx = np.diff(x, axis=0) tie = (dx == 0) if any(tie): # TODO : implement this method for treating ties in data: # Assume measuring error is delta. Then compute # yL = F(xi - delta, theta) # yU = F(xi + delta, theta) # and replace # logDj = log((yU-yL)/(r-1)) for j = i+1,i+2,...i+r-1 # The following is OK when only minimization of T is wanted i_tie, = np.nonzero(tie) tiedata = x[i_tie] logD[i_tie + 1] = log(dist._pdf(tiedata, *args)) - log(scale) finiteD = np.isfinite(logD) nonfiniteD = 1 - finiteD Nbad += np.sum(nonfiniteD, axis=0) if Nbad > 0: T = -np.sum(logD[finiteD], axis=0) + 100.0 * log(_XMAX) * Nbad else: T = -np.sum(logD, axis=0) return T def _fit(self, *args, **kwds): dist = self.dist data = self.data Narg = len(args) if Narg > dist.numargs: raise ValueError("Too many input arguments.") start = [None] * 2 if (Narg < dist.numargs) or not ('loc' in kwds and 'scale' in kwds): # get distribution specific starting locations start = dist._fitstart(data) args += start[Narg:-2] loc = kwds.pop('loc', start[-2]) scale = kwds.pop('scale', start[-1]) args += (loc, scale) x0, func, restore, args, fixedn = self._reduce_func(args, kwds) if self.search: optimizer = kwds.pop('optimizer', optimize.fmin) # convert string to function in scipy.optimize if not callable(optimizer) and isinstance(optimizer, string_types): if not optimizer.startswith('fmin_'): optimizer = "fmin_" + optimizer if optimizer == 'fmin_': optimizer = 'fmin' try: optimizer = getattr(optimize, optimizer) except AttributeError: raise ValueError("%s is not a valid optimizer" % optimizer) # by now kwds must be empty, since everybody took what they needed if kwds: raise TypeError("Unknown arguments: %s." % kwds) vals = optimizer(func, x0, args=(np.ravel(data),), disp=0) vals = tuple(vals) else: vals = tuple(x0) if restore is not None: vals = restore(args, vals) return vals, fixedn def _compute_cov(self): '''Compute covariance ''' somefixed = (self.par_fix is not None) and any(isfinite(self.par_fix)) H = np.asmatrix(self._hessian(self._fitfun, self.par, self.data)) self.H = H try: if somefixed: allfixed = all(isfinite(self.par_fix)) if allfixed: self.par_cov[:, :] = 0 else: pcov = -pinv2(H[self.i_notfixed, :][..., self.i_notfixed]) for row, ix in enumerate(list(self.i_notfixed)): self.par_cov[ix, self.i_notfixed] = pcov[row, :] else: self.par_cov = -pinv2(H) except: self.par_cov[:, :] = nan def fitfun(self, phat): return self._fitfun(phat, self.data) def profile(self, **kwds): ''' Profile Log- likelihood or Log Product Spacing- function, which can be used for constructing confidence interval for either phat(i), probability or quantile. Parameters ---------- **kwds : named arguments with keys i : scalar integer defining which distribution parameter to profile, i.e. which parameter to keep fixed (default first non-fixed parameter) pmin, pmax : real scalars Interval for either the parameter, phat(i), prb, or x, used in the optimization of the profile function (default is based on the 100*(1-alpha)% confidence interval computed with the delta method.) N : scalar integer Max number of points used in Lp (default 100) x : real scalar Quantile (return value) (default None) logSF : real scalar log survival probability,i.e., SF = Prob(X>x;phat) (default None) link : function connecting the x-quantile and the survival probability (SF) with the fixed distribution parameter, i.e.: self.par[i] = link(x,logSF,self.par,i), where logSF = log(Prob(X>x;phat)). This means that if: 1) x is not None then x is profiled 2) logSF is not None then logSF is profiled 3) x and logSF are None then self.par[i] is profiled (default) alpha : real scalar confidence coefficent (default 0.05) Returns ------- Lp : Profile log-likelihood function with parameters phat given the data, phat(i), probability (prb) and quantile (x), i.e., Lp = max(log(f(phat|data,phat(i)))), or Lp = max(log(f(phat|data,phat(i),x,prb))) Member methods ------------- plot() : Plot profile function with 100(1-alpha)% confidence interval get_bounds() : Return 100(1-alpha)% confidence interval Member variables ---------------- fit_dist : FitDistribution data object. data : profile function values args : profile function arguments alpha : confidence coefficient Lmax : Maximum value of profile function alpha_cross_level : PROFILE is a utility function for making inferences either on a particular component of the vector phat or the quantile, x, or the probability, SF. This is usually more accurate than using the delta method assuming asymptotic normality of the ML estimator or the MPS estimator. Examples -------- # MLE >>> import wafo.stats as ws >>> R = ws.weibull_min.rvs(1,size=100); >>> phat = FitDistribution(ws.weibull_min, R, 1, scale=1, floc=0.0) # Better CI for phat.par[i=0] >>> Lp = Profile(phat, i=0) >>> Lp.plot() >>> phat_ci = Lp.get_bounds(alpha=0.1) >>> SF = 1./990 >>> x = phat.isf(SF) # CI for x >>> Lx = phat.profile(i=0, x=x, link=phat.dist.link) >>> Lx.plot() >>> x_ci = Lx.get_bounds(alpha=0.2) # CI for logSF=log(SF) >>> Lsf = phat.profile(i=0, logSF=log(SF), link=phat.dist.link) >>> Lsf.plot() >>> sf_ci = Lsf.get_bounds(alpha=0.2) See also -------- Profile ''' return Profile(self, **kwds) def plotfitsummary(self): ''' Plot various diagnostic plots to asses the quality of the fit. PLOTFITSUMMARY displays probability plot, density plot, residual quantile plot and residual probability plot. The purpose of these plots is to graphically assess whether the data could come from the fitted distribution. If so the empirical- CDF and PDF should follow the model and the residual plots will be linear. Other distribution types will introduce curvature in the residual plots. ''' plotbackend.subplot(2, 2, 1) # self.plotecdf() self.plotesf() plotbackend.subplot(2, 2, 2) self.plotepdf() plotbackend.subplot(2, 2, 3) self.plotresq() plotbackend.subplot(2, 2, 4) self.plotresprb() fixstr = '' if self.par_fix is not None: numfix = len(self.i_fixed) if numfix > 0: format0 = ', '.join(['%d'] * numfix) format1 = ', '.join(['%g'] * numfix) phatistr = format0 % tuple(self.i_fixed) phatvstr = format1 % tuple(self.par[self.i_fixed]) fixstr = 'Fixed: phat[%s] = %s ' % (phatistr, phatvstr) infostr = 'Fit method: %s, Fit p-value: %2.2f %s' % ( self.method.upper(), self.pvalue, fixstr) try: plotbackend.figtext(0.05, 0.01, infostr) except: pass def plotesf(self, symb1='r-', symb2='b.'): ''' Plot Empirical and fitted Survival Function The purpose of the plot is to graphically assess whether the data could come from the fitted distribution. If so the empirical CDF should resemble the model CDF. Other distribution types will introduce deviations in the plot. ''' n = len(self.data) SF = (arange(n, 0, -1)) / n plotbackend.semilogy( self.data, SF, symb2, self.data, self.sf(self.data), symb1) # plotbackend.plot(self.data,SF,'b.',self.data,self.sf(self.data),'r-') plotbackend.xlabel('x') plotbackend.ylabel('F(x) (%s)' % self.dist.name) plotbackend.title('Empirical SF plot') def plotecdf(self, symb1='r-', symb2='b.'): ''' Plot Empirical and fitted Cumulative Distribution Function The purpose of the plot is to graphically assess whether the data could come from the fitted distribution. If so the empirical CDF should resemble the model CDF. Other distribution types will introduce deviations in the plot. ''' n = len(self.data) F = (arange(1, n + 1)) / n plotbackend.plot(self.data, F, symb2, self.data, self.cdf(self.data), symb1) plotbackend.xlabel('x') plotbackend.ylabel('F(x) (%s)' % self.dist.name) plotbackend.title('Empirical CDF plot') def _get_grid(self, odd=False): x = np.atleast_1d(self.data) n = np.ceil(4 * np.sqrt(np.sqrt(len(x)))) mn = x.min() mx = x.max() d = (mx - mn) / n * 2 e = np.floor(np.log(d) / np.log(10)) m = np.floor(d / 10 ** e) if m > 5: m = 5 elif m > 2: m = 2 d = m * 10 ** e mn = (np.floor(mn / d) - 1) * d - odd * d / 2 mx = (np.ceil(mx / d) + 1) * d + odd * d / 2 limits = np.arange(mn, mx, d) return limits def _staircase(self, x, y): xx = x.reshape(-1, 1).repeat(3, axis=1).ravel()[1:-1] yy = y.reshape(-1, 1).repeat(3, axis=1) # yy[0,0] = 0.0 # pdf yy[:, 0] = 0.0 # histogram yy.shape = (-1,) yy = np.hstack((yy, 0.0)) return xx, yy def _get_empirical_pdf(self): limits = self._get_grid() pdf, x = np.histogram(self.data, bins=limits, normed=True) return self._staircase(x, pdf) def plotepdf(self, symb1='r-', symb2='b-'): '''Plot Empirical and fitted Probability Density Function The purpose of the plot is to graphically assess whether the data could come from the fitted distribution. If so the histogram should resemble the model density. Other distribution types will introduce deviations in the plot. ''' x, pdf = self._get_empirical_pdf() ymax = pdf.max() # plotbackend.hist(self.data,normed=True,fill=False) plotbackend.plot(self.data, self.pdf(self.data), symb1, x, pdf, symb2) ax = list(plotbackend.axis()) ax[3] = min(ymax * 1.3, ax[3]) plotbackend.axis(ax) plotbackend.xlabel('x') plotbackend.ylabel('f(x) (%s)' % self.dist.name) plotbackend.title('Density plot') def plotresq(self, symb1='r-', symb2='b.'): '''PLOTRESQ displays a residual quantile plot. The purpose of the plot is to graphically assess whether the data could come from the fitted distribution. If so the plot will be linear. Other distribution types will introduce curvature in the plot. ''' n = len(self.data) eprob = (arange(1, n + 1) - 0.5) / n y = self.ppf(eprob) y1 = self.data[[0, -1]] plotbackend.plot(self.data, y, symb2, y1, y1, symb1) plotbackend.xlabel('Empirical') plotbackend.ylabel('Model (%s)' % self.dist.name) plotbackend.title('Residual Quantile Plot') plotbackend.axis('tight') plotbackend.axis('equal') def plotresprb(self, symb1='r-', symb2='b.'): ''' PLOTRESPRB displays a residual probability plot. The purpose of the plot is to graphically assess whether the data could come from the fitted distribution. If so the plot will be linear. Other distribution types will introduce curvature in the plot. ''' n = len(self.data) # ecdf = (0.5:n-0.5)/n; ecdf = arange(1, n + 1) / (n + 1) mcdf = self.cdf(self.data) p1 = [0, 1] plotbackend.plot(ecdf, mcdf, symb2, p1, p1, symb1) plotbackend.xlabel('Empirical') plotbackend.ylabel('Model (%s)' % self.dist.name) plotbackend.title('Residual Probability Plot') plotbackend.axis('equal') plotbackend.axis([0, 1, 0, 1]) def _pvalue(self, theta, x, unknown_numpar=None): ''' Return P-value for the fit using Moran's negative log Product Spacings statistic where theta are the parameters (including loc and scale) Note: the data in x must be sorted ''' dx = np.diff(x, axis=0) tie = (dx == 0) if any(tie): warnings.warn( 'P-value is on the conservative side (i.e. too large) due to' + ' ties in the data!') T = self.dist.nlogps(theta, x) n = len(x) np1 = n + 1 if unknown_numpar is None: k = len(theta) else: k = unknown_numpar isParUnKnown = True m = (np1) * (log(np1) + 0.57722) - 0.5 - 1.0 / (12. * (np1)) v = (np1) * (pi ** 2. / 6.0 - 1.0) - 0.5 - 1.0 / (6. * (np1)) C1 = m - sqrt(0.5 * n * v) C2 = sqrt(v / (2.0 * n)) # chi2 with n degrees of freedom Tn = (T + 0.5 * k * isParUnKnown - C1) / C2 pvalue = chi2sf(Tn, n) # _WAFODIST.chi2.sf(Tn, n) return pvalue def test_doctstrings(): import doctest doctest.testmod() def test1(): import wafo.stats as ws dist = ws.weibull_min # dist = ws.bradford R = dist.rvs(0.3, size=1000) phat = FitDistribution(dist, R, method='ml') # Better CI for phat.par[i=0] Lp1 = Profile(phat, i=0) # @UnusedVariable Lp1.plot() import matplotlib.pyplot as plt plt.show() # Lp2 = Profile(phat, i=2) # SF = 1./990 # x = phat.isf(SF) # # CI for x # Lx = Profile(phat, i=0,x=x,link=phat.dist.link) # Lx.plot() # x_ci = Lx.get_bounds(alpha=0.2) # # CI for logSF=log(SF) # Lsf = phat.profile(i=0, logSF=log(SF), link=phat.dist.link) # Lsf.plot() # sf_ci = Lsf.get_bounds(alpha=0.2) # pass # _WAFODIST = ppimport('wafo.stats.distributions') # nbinom(10, 0.75).rvs(3) # import matplotlib # matplotlib.interactive(True) # t = _WAFODIST.bernoulli(0.75).rvs(3) # x = np.r_[5, 10] # npr = np.r_[9, 9] # t2 = _WAFODIST.bd0(x, npr) # Examples MLE and better CI for phat.par[0] # R = _WAFODIST.weibull_min.rvs(1, size=100); # phat = _WAFODIST.weibull_min.fit(R, 1, 1, par_fix=[nan, 0, nan]) # Lp = phat.profile(i=0) # Lp.plot() # Lp.get_bounds(alpha=0.1) # R = 1. / 990 # x = phat.isf(R) # # CI for x # Lx = phat.profile(i=0, x=x) # Lx.plot() # Lx.get_bounds(alpha=0.2) # # CI for logSF=log(SF) # Lpr = phat.profile(i=0, logSF=log(R), link=phat.dist.link) # Lpr.plot() # Lpr.get_bounds(alpha=0.075) # # _WAFODIST.dlaplace.stats(0.8, loc=0) # pass # t = _WAFODIST.planck(0.51000000000000001) # t.ppf(0.5) # t = _WAFODIST.zipf(2) # t.ppf(0.5) # import pylab as plb # _WAFODIST.rice.rvs(1) # x = plb.linspace(-5, 5) # y = _WAFODIST.genpareto.cdf(x, 0) # plb.plot(x,y) # plb.show() # # # on = ones((2, 3)) # r = _WAFODIST.genpareto.rvs(0, size=100) # pht = _WAFODIST.genpareto.fit(r, 1, par_fix=[0, 0, nan]) # lp = pht.profile() if __name__ == '__main__': test1() # test_doctstrings()